Commit 7b502e8

Author: Herton R. Krzesinski (committed)
Merge: xfs, iomap: fix data corruption due to stale cached iomap
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/2022
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2155605
Tested: with xfstests and the bz reproducer

The main goal of this series is to fix a data corruption caused by a race between unaligned buffered writes and low memory conditions, where both writeback and memory reclaim race with the writes. Fixing that race uncovered a few other issues, so the series fixes not only the race described above but also a race within XFS itself, caused by truncating the page cache and punching delalloc extents without synchronization.

Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
Approved-by: Bill O'Donnell <bodonnel@redhat.com>
Approved-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Herton R. Krzesinski <herton@redhat.com>
2 parents: d581fa2 + ad48080

13 files changed (+469 −113 lines)

fs/iomap/buffered-io.c

Lines changed: 254 additions & 1 deletion
@@ -590,7 +590,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
         return iomap_read_inline_data(iter, page);
 }
 
-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
                 unsigned len, struct page **pagep)
 {
         const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
@@ -618,6 +618,26 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
                 goto out_no_page;
         }
 
+        /*
+         * Now we have a locked folio, before we do anything with it we need to
+         * check that the iomap we have cached is not stale. The inode extent
+         * mapping can change due to concurrent IO in flight (e.g.
+         * IOMAP_UNWRITTEN state can change and memory reclaim could have
+         * reclaimed a previously partially written page at this index after IO
+         * completion before this write reaches this file offset) and hence we
+         * could do the wrong thing here (zero a page range incorrectly or fail
+         * to zero) and corrupt data.
+         */
+        if (page_ops && page_ops->iomap_valid) {
+                bool iomap_valid = page_ops->iomap_valid(iter->inode,
+                                                         &iter->iomap);
+                if (!iomap_valid) {
+                        iter->iomap.flags |= IOMAP_F_STALE;
+                        status = 0;
+                        goto out_unlock;
+                }
+        }
+
         if (srcmap->type == IOMAP_INLINE)
                 status = iomap_write_begin_inline(iter, page);
         else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
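For context, the new ->iomap_valid hook is intended to be backed by a generation value the filesystem samples while it builds the mapping in ->iomap_begin, and re-checks once the folio is locked. The sketch below is illustrative only: the example_* names and the generation field are assumptions for this commentary, not the XFS implementation found elsewhere in this series.

/*
 * Minimal sketch of a sequence-based ->iomap_valid callback. It assumes the
 * filesystem snapshotted an extent-map generation counter into its private
 * iomap data when the mapping was created; if the inode's extent map has
 * changed since, the cached iomap must be treated as stale. All example_*
 * names are hypothetical.
 */
static bool example_iomap_valid(struct inode *inode, const struct iomap *iomap)
{
        const struct example_iomap_data *data = iomap->private;

        /* Stale if the inode's extent map generation has moved on. */
        return data->seq == READ_ONCE(EXAMPLE_I(inode)->extent_seq);
}

static const struct iomap_page_ops example_page_ops = {
        .iomap_valid    = example_iomap_valid,
};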
@@ -757,6 +777,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
                 status = iomap_write_begin(iter, pos, bytes, &page);
                 if (unlikely(status))
                         break;
+                if (iter->iomap.flags & IOMAP_F_STALE)
+                        break;
 
                 if (mapping_writably_mapped(iter->inode->i_mapping))
                         flush_dcache_page(page);
@@ -810,6 +832,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+/*
+ * Scan the data range passed to us for dirty page cache folios. If we find a
+ * dirty folio, punch out the preceding range and update the offset from which
+ * the next punch will start.
+ *
+ * We can punch out storage reservations under clean pages because they either
+ * contain data that has been written back - in which case the delalloc punch
+ * over that range is a no-op - or they have been read faults in which case they
+ * contain zeroes and we can remove the delalloc backing range and any new
+ * writes to those pages will do the normal hole filling operation...
+ *
+ * This makes the logic simple: we only need to keep the delalloc extents over
+ * the dirty ranges of the page cache.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ */
+static int iomap_write_delalloc_scan(struct inode *inode,
+                loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+                int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+{
+        while (start_byte < end_byte) {
+                struct folio *folio;
+
+                /* grab locked page */
+                folio = filemap_lock_folio(inode->i_mapping,
+                                start_byte >> PAGE_SHIFT);
+                if (!folio) {
+                        start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
+                                        PAGE_SIZE;
+                        continue;
+                }
+
+                /* if dirty, punch up to offset */
+                if (folio_test_dirty(folio)) {
+                        if (start_byte > *punch_start_byte) {
+                                int error;
+
+                                error = punch(inode, *punch_start_byte,
+                                                start_byte - *punch_start_byte);
+                                if (error) {
+                                        folio_unlock(folio);
+                                        folio_put(folio);
+                                        return error;
+                                }
+                        }
+
+                        /*
+                         * Make sure the next punch start is correctly bound to
+                         * the end of this data range, not the end of the folio.
+                         */
+                        *punch_start_byte = min_t(loff_t, end_byte,
+                                        folio_next_index(folio) << PAGE_SHIFT);
+                }
+
+                /* move offset to start of next folio in range */
+                start_byte = folio_next_index(folio) << PAGE_SHIFT;
+                folio_unlock(folio);
+                folio_put(folio);
+        }
+        return 0;
+}
+
+/*
+ * Punch out all the delalloc blocks in the range given except for those that
+ * have dirty data still pending in the page cache - those are going to be
+ * written and so must still retain the delalloc backing for writeback.
+ *
+ * As we are scanning the page cache for data, we don't need to reimplement the
+ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
+ * start and end of data ranges correctly even for sub-folio block sizes. This
+ * byte range based iteration is especially convenient because it means we
+ * don't have to care about variable size folios, nor where the start or end of
+ * the data range lies within a folio, if they lie within the same folio or even
+ * if there are multiple discontiguous data ranges within the folio.
+ *
+ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
+ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
+ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
+ * date. A write page fault can then mark it dirty. If we then fail a write()
+ * beyond EOF into that up to date cached range, we allocate a delalloc block
+ * beyond EOF and then have to punch it out. Because the range is up to date,
+ * mapping_seek_hole_data() will return it, and we will skip the punch because
+ * the folio is dirty. This is incorrect - we always need to punch out delalloc
+ * beyond EOF in this case as writeback will never write back and convert that
+ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
+ * resulting in always punching out the range from the EOF to the end of the
+ * range the iomap spans.
+ *
+ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because it
+ * matches the intervals returned by mapping_seek_hole_data(). i.e. SEEK_DATA
+ * returns the start of a data range (start_byte), and SEEK_HOLE(start_byte)
+ * returns the end of the data range (data_end). Using closed intervals would
+ * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose
+ * the code to subtle off-by-one bugs....
+ */
+static int iomap_write_delalloc_release(struct inode *inode,
+                loff_t start_byte, loff_t end_byte,
+                int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+        loff_t punch_start_byte = start_byte;
+        loff_t scan_end_byte = min(i_size_read(inode), end_byte);
+        int error = 0;
+
+        /*
+         * Lock the mapping to avoid races with page faults re-instantiating
+         * folios and dirtying them via ->page_mkwrite whilst we walk the
+         * cache and perform delalloc extent removal. Failing to do this can
+         * leave dirty pages with no space reservation in the cache.
+         */
+        filemap_invalidate_lock(inode->i_mapping);
+        while (start_byte < scan_end_byte) {
+                loff_t data_end;
+
+                start_byte = mapping_seek_hole_data(inode->i_mapping,
+                                start_byte, scan_end_byte, SEEK_DATA);
+                /*
+                 * If there is no more data to scan, all that is left is to
+                 * punch out the remaining range.
+                 */
+                if (start_byte == -ENXIO || start_byte == scan_end_byte)
+                        break;
+                if (start_byte < 0) {
+                        error = start_byte;
+                        goto out_unlock;
+                }
+                WARN_ON_ONCE(start_byte < punch_start_byte);
+                WARN_ON_ONCE(start_byte > scan_end_byte);
+
+                /*
+                 * We find the end of this contiguous cached data range by
+                 * seeking from start_byte to the beginning of the next hole.
+                 */
+                data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
+                                scan_end_byte, SEEK_HOLE);
+                if (data_end < 0) {
+                        error = data_end;
+                        goto out_unlock;
+                }
+                WARN_ON_ONCE(data_end <= start_byte);
+                WARN_ON_ONCE(data_end > scan_end_byte);
+
+                error = iomap_write_delalloc_scan(inode, &punch_start_byte,
+                                start_byte, data_end, punch);
+                if (error)
+                        goto out_unlock;
+
+                /* The next data search starts at the end of this one. */
+                start_byte = data_end;
+        }
+
+        if (punch_start_byte < end_byte)
+                error = punch(inode, punch_start_byte,
+                                end_byte - punch_start_byte);
+out_unlock:
+        filemap_invalidate_unlock(inode->i_mapping);
+        return error;
+}
+
+/*
+ * When a short write occurs, the filesystem may need to remove reserved space
+ * that was allocated in ->iomap_begin from its ->iomap_end method. For
+ * filesystems that use delayed allocation, we need to punch out delalloc
+ * extents from the range that are not dirty in the page cache. As the write can
+ * race with page faults, there can be dirty pages over the delalloc extent
+ * outside the range of a short write but still within the delalloc extent
+ * allocated for this iomap.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ *
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently punching.
+ * The locked folio may or may not cover the range being punched, so it is not
+ * safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ *   inode->i_mapping->invalidate_lock (exclusive)
+ *     folio_lock()
+ *       ->punch
+ *         internal filesystem allocation lock
+ */
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+                struct iomap *iomap, loff_t pos, loff_t length,
+                ssize_t written,
+                int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+        loff_t start_byte;
+        loff_t end_byte;
+        int blocksize = i_blocksize(inode);
+
+        if (iomap->type != IOMAP_DELALLOC)
+                return 0;
+
+        /* If we didn't reserve the blocks, we're not allowed to punch them. */
+        if (!(iomap->flags & IOMAP_F_NEW))
+                return 0;
+
+        /*
+         * start_byte refers to the first unused block after a short write. If
+         * nothing was written, round offset down to point at the first block in
+         * the range.
+         */
+        if (unlikely(!written))
+                start_byte = round_down(pos, blocksize);
+        else
+                start_byte = round_up(pos + written, blocksize);
+        end_byte = round_up(pos + length, blocksize);
+
+        /* Nothing to do if we've written the entire delalloc extent */
+        if (start_byte >= end_byte)
+                return 0;
+
+        return iomap_write_delalloc_release(inode, start_byte, end_byte,
+                                        punch);
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+
 static loff_t iomap_unshare_iter(struct iomap_iter *iter)
 {
         struct iomap *iomap = &iter->iomap;
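To make the calling convention concrete, this is roughly how a delayed-allocation filesystem would wire the new helper into its ->iomap_end method after a short or failed buffered write. The example_* functions below are placeholders for this commentary (the XFS patches in this series build their callback around the existing xfs_bmap_punch_delalloc_range() primitive, held under the appropriate inode lock); this is a sketch, not code from the patch.

/*
 * Sketch of wiring iomap_file_buffered_write_punch_delalloc() into an
 * ->iomap_end implementation. example_punch_delalloc_range() stands in for
 * the filesystem's own "remove delalloc extents in this byte range, skip
 * everything else" primitive and must honour the locking rules documented
 * in the comment above iomap_file_buffered_write_punch_delalloc().
 */
static int example_punch(struct inode *inode, loff_t offset, loff_t length)
{
        return example_punch_delalloc_range(inode, offset, offset + length);
}

static int example_buffered_write_iomap_end(struct inode *inode, loff_t offset,
                loff_t length, ssize_t written, unsigned flags,
                struct iomap *iomap)
{
        /* Only IOMAP_F_NEW delalloc mappings are punched; others return 0. */
        return iomap_file_buffered_write_punch_delalloc(inode, iomap, offset,
                        length, written, example_punch);
}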
@@ -834,6 +1081,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
                 status = iomap_write_begin(iter, pos, bytes, &page);
                 if (unlikely(status))
                         return status;
+                if (iter->iomap.flags & IOMAP_F_STALE)
+                        break;
 
                 status = iomap_write_end(iter, pos, bytes, bytes, page);
                 if (WARN_ON_ONCE(status == 0))
@@ -879,6 +1128,8 @@ static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
                 status = iomap_write_begin(iter, pos, bytes, &page);
                 if (status)
                         return status;
+                if (iter->iomap.flags & IOMAP_F_STALE)
+                        return status;
 
                 zero_user(page, offset, bytes);
                 mark_page_accessed(page);
@@ -907,6 +1158,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
                 bytes = __iomap_zero_iter(iter, pos, length);
                 if (bytes < 0)
                         return bytes;
+                if (!bytes && iter->iomap.flags & IOMAP_F_STALE)
+                        break;
 
                 pos += bytes;
                 length -= bytes;

fs/iomap/iter.c

Lines changed: 18 additions & 1 deletion
@@ -7,12 +7,28 @@
 #include <linux/iomap.h>
 #include "trace.h"
 
+/*
+ * Advance to the next range we need to map.
+ *
+ * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
+ * processed - it was aborted because the extent the iomap spanned may have been
+ * changed during the operation. In this case, the iteration behaviour is to
+ * remap the unprocessed range of the iter, and that means we may need to remap
+ * even when we've made no progress (i.e. iter->processed = 0). Hence the
+ * "finished iterating" case needs to distinguish between
+ * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
+ * need to remap the entire remaining range.
+ */
 static inline int iomap_iter_advance(struct iomap_iter *iter)
 {
+        bool stale = iter->iomap.flags & IOMAP_F_STALE;
+
         /* handle the previous iteration (if any) */
         if (iter->iomap.length) {
-                if (iter->processed <= 0)
+                if (iter->processed < 0)
                         return iter->processed;
+                if (!iter->processed && !stale)
+                        return 0;
                 if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
                         return -EIO;
                 iter->pos += iter->processed;
@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
         WARN_ON_ONCE(iter->iomap.offset > iter->pos);
         WARN_ON_ONCE(iter->iomap.length == 0);
         WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
+        WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
 
         trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
         if (iter->srcmap.type != IOMAP_HOLE)
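The caller-visible effect of this iterator change is easiest to see in the standard iomap_iter() loop. The fragment below is a sketch of that pattern with a hypothetical actor; inode, pos, len and ops are assumed to come from the surrounding code, and example_write_actor() stands in for an actor such as iomap_write_iter().

/*
 * Typical iteration pattern the IOMAP_F_STALE handling relies on. When the
 * actor bails out because the cached mapping went stale, iter.processed may
 * legitimately be 0; iomap_iter_advance() now treats "0 bytes processed but
 * stale" as "remap the remaining range" rather than "iteration finished",
 * so the loop calls ->iomap_begin again for the unprocessed bytes.
 */
struct iomap_iter iter = {
        .inode  = inode,
        .pos    = pos,
        .len    = len,
        .flags  = IOMAP_WRITE,
};
int ret;

while ((ret = iomap_iter(&iter, ops)) > 0)
        iter.processed = example_write_actor(&iter);    /* may set IOMAP_F_STALE */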

fs/xfs/libxfs/xfs_bmap.c

Lines changed: 4 additions & 2 deletions
@@ -4551,7 +4551,8 @@ xfs_bmapi_convert_delalloc(
          * the extent. Just return the real extent at this offset.
          */
         if (!isnullstartblock(bma.got.br_startblock)) {
-                xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+                xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags,
+                                xfs_iomap_inode_sequence(ip, flags));
                 *seq = READ_ONCE(ifp->if_seq);
                 goto out_trans_cancel;
         }
@@ -4598,7 +4599,8 @@ xfs_bmapi_convert_delalloc(
         XFS_STATS_INC(mp, xs_xstrat_quick);
 
         ASSERT(!isnullstartblock(bma.got.br_startblock));
-        xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags);
+        xfs_bmbt_to_iomap(ip, iomap, &bma.got, flags,
+                        xfs_iomap_inode_sequence(ip, flags));
         *seq = READ_ONCE(ifp->if_seq);
 
         if (whichfork == XFS_COW_FORK)
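The extra argument now passed to xfs_bmbt_to_iomap() is the validity cookie that the ->iomap_valid check later compares against. The sketch below approximates what xfs_iomap_inode_sequence() computes; it is reconstructed for illustration (using a hypothetical example_ name), not copied from the patch, and the exact fork handling in the real helper may differ.

/*
 * Approximate shape of the sequence cookie: a snapshot of the per-fork
 * extent list generation numbers, folded into one u64 so a later
 * ->iomap_valid check can detect that the inode's extent map changed
 * while the iomap was cached. Illustrative only - not the verbatim helper.
 */
static u64 example_iomap_inode_sequence(struct xfs_inode *ip, u16 iomap_flags)
{
        u64 cookie = 0;

        /* Shared (COW fork) mappings also fold in the COW fork generation. */
        if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp)
                cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32;
        return cookie | READ_ONCE(ip->i_df.if_seq);
}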

fs/xfs/libxfs/xfs_errortag.h

Lines changed: 5 additions & 7 deletions
@@ -40,13 +40,12 @@
 #define XFS_ERRTAG_REFCOUNT_FINISH_ONE                  25
 #define XFS_ERRTAG_BMAP_FINISH_ONE                      26
 #define XFS_ERRTAG_AG_RESV_CRITICAL                     27
+
 /*
- * DEBUG mode instrumentation to test and/or trigger delayed allocation
- * block killing in the event of failed writes. When enabled, all
- * buffered writes are silenty dropped and handled as if they failed.
- * All delalloc blocks in the range of the write (including pre-existing
- * delalloc blocks!) are tossed as part of the write failure error
- * handling sequence.
+ * Drop-writes support removed because write error handling cannot trash
+ * pre-existing delalloc extents in any useful way anymore. We retain the
+ * definition so that we can reject it as an invalid value in
+ * xfs_errortag_valid().
  */
 #define XFS_ERRTAG_DROP_WRITES                          28
 #define XFS_ERRTAG_LOG_BAD_CRC                          29
@@ -92,7 +91,6 @@
 #define XFS_RANDOM_REFCOUNT_FINISH_ONE                  1
 #define XFS_RANDOM_BMAP_FINISH_ONE                      1
 #define XFS_RANDOM_AG_RESV_CRITICAL                     4
-#define XFS_RANDOM_DROP_WRITES                          1
 #define XFS_RANDOM_LOG_BAD_CRC                          1
 #define XFS_RANDOM_LOG_ITEM_PIN                         1
 #define XFS_RANDOM_BUF_LRU_REF                          2

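Since the XFS_ERRTAG_DROP_WRITES value is kept only so it can be rejected, the validation mentioned in the new comment plausibly takes the shape sketched below. example_errortag_valid() is illustrative; the real check is the xfs_errortag_valid() helper referenced in the comment, not this code.

/*
 * Sketch of how the retained-but-invalid tag is handled: the validation
 * helper rejects XFS_ERRTAG_DROP_WRITES so the value can never be armed
 * via error injection again. Illustrative shape only.
 */
static bool example_errortag_valid(unsigned int error_tag)
{
        if (error_tag >= XFS_ERRTAG_MAX)
                return false;
        /* Drop-writes support was removed; reject the stale tag value. */
        if (error_tag == XFS_ERRTAG_DROP_WRITES)
                return false;
        return true;
}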