 // SPDX-License-Identifier: GPL-2.0
 /*
- * Copyright (C) 2010 Red Hat, Inc.
+ * Copyright (C) 2010, 2023 Red Hat, Inc.
  * All Rights Reserved.
  */
 #include "xfs.h"
 #include "xfs_log.h"
 #include "xfs_ag.h"

-STATIC int
-xfs_trim_extents(
+/*
+ * Notes on an efficient, low latency fstrim algorithm
+ *
+ * We need to walk the filesystem free space and issue discards on the free
+ * space that meets the search criteria (size and location). We cannot issue
+ * discards on extents that might be in use, or are so recently in use they are
+ * still marked as busy. To serialise against extent state changes whilst we
+ * are gathering extents to trim, we must hold the AGF lock to lock out other
+ * allocations and extent free operations that might change extent state.
+ *
+ * However, we cannot just hold the AGF for the entire AG free space walk whilst
+ * we issue discards on each free space that is found. Storage devices can have
+ * extremely slow discard implementations (e.g. ceph RBD) and so walking a
+ * couple of million free extents and issuing synchronous discards on each
+ * extent can take a *long* time. Whilst we are doing this walk, nothing else
+ * can access the AGF, and we can stall transactions and hence the log whilst
+ * modifications wait for the AGF lock to be released. This can lead to hung
+ * tasks kicking the hung task timer and rebooting the system. This is bad.
+ *
+ * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
+ * lock, gathers a range of inode cluster buffers that are allocated, drops the
+ * AGI lock and then reads all the inode cluster buffers and processes them. It
+ * loops doing this, using a cursor to keep track of where it is up to in the AG
+ * for each iteration to restart the INOBT lookup from.
+ *
+ * We can't do this exactly with free space - once we drop the AGF lock, the
+ * state of the free extent is out of our control and we cannot run a discard
+ * safely on it in this situation. Unless, of course, we've marked the free
+ * extent as busy and undergoing a discard operation whilst we held the AGF
+ * locked.
+ *
+ * This is exactly how online discard works - free extents are marked busy when
+ * they are freed, and once the extent free has been committed to the journal,
+ * the busy extent record is marked as "undergoing discard" and the discard is
+ * then issued on the free extent. Once the discard completes, the busy extent
+ * record is removed and the extent is able to be allocated again.
+ *
+ * In the context of fstrim, if we find a free extent we need to discard, we
+ * don't have to discard it immediately. All we need to do is record that free
+ * extent as being busy and under discard, and all the allocation routines will
+ * now avoid trying to allocate it. Hence if we mark the extent as busy under
+ * the AGF lock, we can safely discard it without holding the AGF lock because
+ * nothing will attempt to allocate that free space until the discard completes.
+ *
+ * This also allows us to issue discards asynchronously like we do with online
+ * discard, and so for fast devices fstrim will run much faster as we can have
+ * multiple discard operations in flight at once, as well as pipeline the free
+ * extent search so that it overlaps with in-flight discard IO.
+ */
+
+struct workqueue_struct *xfs_discard_wq;
+
+static void
+xfs_discard_endio_work(
+	struct work_struct	*work)
+{
+	struct xfs_busy_extents	*extents =
+		container_of(work, struct xfs_busy_extents, endio_work);
+
+	xfs_extent_busy_clear(extents->mount, &extents->extent_list, false);
+	kmem_free(extents->owner);
+}
+
+/*
+ * Queue up the actual completion to a thread to avoid IRQ-safe locking for
+ * pagb_lock.
+ */
+static void
+xfs_discard_endio(
+	struct bio		*bio)
+{
+	struct xfs_busy_extents	*extents = bio->bi_private;
+
+	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
+	queue_work(xfs_discard_wq, &extents->endio_work);
+	bio_put(bio);
+}
+
+/*
+ * Walk the discard list and issue discards on all the busy extents in the
+ * list. We plug and chain the bios so that we only need a single completion
+ * call to clear all the busy extents once the discards are complete.
+ */
+int
+xfs_discard_extents(
+	struct xfs_mount	*mp,
+	struct xfs_busy_extents	*extents)
+{
+	struct xfs_extent_busy	*busyp;
+	struct bio		*bio = NULL;
+	struct blk_plug		plug;
+	int			error = 0;
+
+	blk_start_plug(&plug);
+	list_for_each_entry(busyp, &extents->extent_list, list) {
+		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
+				busyp->length);
+
+		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
+				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
+				XFS_FSB_TO_BB(mp, busyp->length),
+				GFP_NOFS, &bio);
+		if (error && error != -EOPNOTSUPP) {
+			xfs_info(mp,
+	 "discard failed for extent [0x%llx,%u], error %d",
+				(unsigned long long)busyp->bno,
+				busyp->length,
+				error);
+			break;
+		}
+	}
+
+	if (bio) {
+		bio->bi_private = extents;
+		bio->bi_end_io = xfs_discard_endio;
+		submit_bio(bio);
+	} else {
+		xfs_discard_endio_work(&extents->endio_work);
+	}
+	blk_finish_plug(&plug);
+
+	return error;
+}
+
+
+static int
+xfs_trim_gather_extents(
 	struct xfs_perag	*pag,
 	xfs_daddr_t		start,
 	xfs_daddr_t		end,
 	xfs_daddr_t		minlen,
+	struct xfs_alloc_rec_incore *tcur,
+	struct xfs_busy_extents	*extents,
 	uint64_t		*blocks_trimmed)
 {
 	struct xfs_mount	*mp = pag->pag_mount;
-	struct block_device	*bdev = mp->m_ddev_targp->bt_bdev;
 	struct xfs_btree_cur	*cur;
 	struct xfs_buf		*agbp;
-	struct xfs_agf		*agf;
 	int			error;
 	int			i;
+	int			batch = 100;

 	/*
 	 * Force out the log. This means any transactions that might have freed
@@ -45,20 +171,28 @@ xfs_trim_extents(
 	error = xfs_alloc_read_agf(pag, NULL, 0, &agbp);
 	if (error)
 		return error;
-	agf = agbp->b_addr;

 	cur = xfs_allocbt_init_cursor(mp, NULL, agbp, pag, XFS_BTNUM_CNT);

 	/*
-	 * Look up the longest btree in the AGF and start with it.
+	 * Look up the extent length requested in the AGF and start with it.
 	 */
-	error = xfs_alloc_lookup_ge(cur, 0, be32_to_cpu(agf->agf_longest), &i);
+	if (tcur->ar_startblock == NULLAGBLOCK)
+		error = xfs_alloc_lookup_ge(cur, 0, tcur->ar_blockcount, &i);
+	else
+		error = xfs_alloc_lookup_le(cur, tcur->ar_startblock,
+				tcur->ar_blockcount, &i);
 	if (error)
 		goto out_del_cursor;
+	if (i == 0) {
+		/* nothing of that length left in the AG, we are done */
+		tcur->ar_blockcount = 0;
+		goto out_del_cursor;
+	}

 	/*
 	 * Loop until we are done with all extents that are large
-	 * enough to be worth discarding.
+	 * enough to be worth discarding or we hit batch limits.
 	 */
 	while (i) {
 		xfs_agblock_t	fbno;
@@ -73,7 +207,16 @@ xfs_trim_extents(
 			error = -EFSCORRUPTED;
 			break;
 		}
-		ASSERT(flen <= be32_to_cpu(agf->agf_longest));
+
+		if (--batch <= 0) {
+			/*
+			 * Update the cursor to point at this extent so we
+			 * restart the next batch from this extent.
+			 */
+			tcur->ar_startblock = fbno;
+			tcur->ar_blockcount = flen;
+			break;
+		}

 		/*
 		 * use daddr format for all range/len calculations as that is
@@ -88,6 +231,7 @@ xfs_trim_extents(
 		 */
 		if (dlen < minlen) {
 			trace_xfs_discard_toosmall(mp, pag->pag_agno, fbno, flen);
+			tcur->ar_blockcount = 0;
 			break;
 		}

@@ -110,29 +254,103 @@ xfs_trim_extents(
 			goto next_extent;
 		}

-		trace_xfs_discard_extent(mp, pag->pag_agno, fbno, flen);
-		error = blkdev_issue_discard(bdev, dbno, dlen, GFP_NOFS);
-		if (error)
-			break;
+		xfs_extent_busy_insert_discard(pag, fbno, flen,
+				&extents->extent_list);
 		*blocks_trimmed += flen;
-
 next_extent:
 		error = xfs_btree_decrement(cur, 0, &i);
 		if (error)
 			break;

-		if (fatal_signal_pending(current)) {
-			error = -ERESTARTSYS;
-			break;
-		}
+		/*
+		 * If there are no more records in the tree, we are done. Set
+		 * the cursor block count to 0 to indicate to the caller that
+		 * there are no more extents to search.
+		 */
+		if (i == 0)
+			tcur->ar_blockcount = 0;
 	}

+	/*
+	 * If there was an error, release all the gathered busy extents because
+	 * we aren't going to issue a discard on them any more.
+	 */
+	if (error)
+		xfs_extent_busy_clear(mp, &extents->extent_list, false);
 out_del_cursor:
 	xfs_btree_del_cursor(cur, error);
 	xfs_buf_relse(agbp);
 	return error;
 }

+static bool
+xfs_trim_should_stop(void)
+{
+	return fatal_signal_pending(current) || freezing(current);
+}
+
+/*
+ * Iterate the free list gathering extents and discarding them. We need a
+ * cursor for the repeated iteration of the gather/discard loop, so use the
+ * longest extent we found in the last batch as the key to start the next.
+ */
+static int
+xfs_trim_extents(
+	struct xfs_perag	*pag,
+	xfs_daddr_t		start,
+	xfs_daddr_t		end,
+	xfs_daddr_t		minlen,
+	uint64_t		*blocks_trimmed)
+{
+	struct xfs_alloc_rec_incore tcur = {
+		.ar_blockcount = pag->pagf_longest,
+		.ar_startblock = NULLAGBLOCK,
+	};
+	int			error = 0;
+
+	do {
+		struct xfs_busy_extents	*extents;
+
+		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
+		if (!extents) {
+			error = -ENOMEM;
+			break;
+		}
+
+		extents->mount = pag->pag_mount;
+		extents->owner = extents;
+		INIT_LIST_HEAD(&extents->extent_list);
+
+		error = xfs_trim_gather_extents(pag, start, end, minlen,
+				&tcur, extents, blocks_trimmed);
+		if (error) {
+			kfree(extents);
+			break;
+		}
+
+		/*
+		 * We hand the extent list to the discard function here so the
+		 * discarded extents can be removed from the busy extent list.
+		 * This allows the discards to run asynchronously with gathering
+		 * the next round of extents to discard.
+		 *
+		 * However, we must ensure that we do not reference the extent
+		 * list after this function call, as it may have been freed by
+		 * the time control returns to us.
+		 */
+		error = xfs_discard_extents(pag->pag_mount, extents);
+		if (error)
+			break;
+
+		if (xfs_trim_should_stop())
+			break;
+
+	} while (tcur.ar_blockcount != 0);
+
+	return error;
+
+}
+
 /*
  * trim a range of the filesystem.
  *
@@ -195,12 +413,12 @@ xfs_ioc_trim(
 	for_each_perag_range(mp, agno, xfs_daddr_to_agno(mp, end), pag) {
 		error = xfs_trim_extents(pag, start, end, minlen,
 				&blocks_trimmed);
-		if (error) {
+		if (error)
 			last_error = error;
-			if (error == -ERESTARTSYS) {
-				xfs_perag_rele(pag);
-				break;
-			}
+
+		if (xfs_trim_should_stop()) {
+			xfs_perag_rele(pag);
+			break;
 		}
 	}

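Not part of the patch, for context only: the xfs_ioc_trim() path touched at the end of this diff services the generic FITRIM ioctl, which is how the start/end/minlen search criteria described in the design notes above reach xfs_trim_extents(). Below is a minimal userspace sketch of driving that interface; the mount point path and the 1MiB minlen are arbitrary example values, and the fstrim(8) utility is the usual way to do this in practice.

/*
 * Hypothetical example (not from the patch): trim all free extents of at
 * least 1MiB on the filesystem containing /mnt/scratch via FITRIM.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>		/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = UINT64_MAX,		/* whole filesystem */
		.minlen = 1024 * 1024,		/* ignore smaller free extents */
	};
	int fd = open("/mnt/scratch", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	/* on return the kernel has updated range.len to the bytes trimmed */
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	return 0;
}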