Skip to content

Commit 5fe3d30

Browse files
committed
iomap: write iomap validity checks
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=2155605 Tested: With xfstests and bz reproducer Conflicts: - iomap_zero_iter() logic is a bit different from upstream - Context A recent multithreaded write data corruption has been uncovered in the iomap write code. The core of the problem is partial folio writes can be flushed to disk while a new racing write can map it and fill the rest of the page: writeback new write allocate blocks blocks are unwritten submit IO ..... map blocks iomap indicates UNWRITTEN range loop { lock folio copyin data ..... IO completes runs unwritten extent conv blocks are marked written <iomap now stale> get next folio } Now add memory pressure such that memory reclaim evicts the partially written folio that has already been written to disk. When the new write finally gets to the last partial page of the new write, it does not find it in cache, so it instantiates a new page, sees the iomap is unwritten, and zeros the part of the page that it does not have data from. This overwrites the data on disk that was originally written. The full description of the corruption mechanism can be found here: https://lore.kernel.org/linux-xfs/20220817093627.GZ3600936@dread.disaster.area/ To solve this problem, we need to check whether the iomap is still valid after we lock each folio during the write. We have to do it after we lock the page so that we don't end up with state changes occurring while we wait for the folio to be locked. Hence we need a mechanism to be able to check that the cached iomap is still valid (similar to what we already do in buffered writeback), and we need a way for ->begin_write to back out and tell the high level iomap iterator that we need to remap the remaining write range. The iomap needs to grow some storage for the validity cookie that the filesystem provides to travel with the iomap. XFS, in particular, also needs to know some more information about what the iomap maps (attribute extents rather than file data extents) to for the validity cookie to cover all the types of iomaps we might need to validate. Signed-off-by: Dave Chinner <dchinner@redhat.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Darrick J. Wong <djwong@kernel.org> (cherry picked from commit d7b6404) Signed-off-by: Carlos Maiolino <cmaiolino@redhat.com>
1 parent 48ccb79 commit 5fe3d30

File tree

3 files changed

+82
-10
lines changed

3 files changed

+82
-10
lines changed

fs/iomap/buffered-io.c

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
590590
return iomap_read_inline_data(iter, page);
591591
}
592592

593-
static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
593+
static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
594594
unsigned len, struct page **pagep)
595595
{
596596
const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
@@ -618,6 +618,26 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
618618
goto out_no_page;
619619
}
620620

621+
/*
622+
* Now we have a locked folio, before we do anything with it we need to
623+
* check that the iomap we have cached is not stale. The inode extent
624+
* mapping can change due to concurrent IO in flight (e.g.
625+
* IOMAP_UNWRITTEN state can change and memory reclaim could have
626+
* reclaimed a previously partially written page at this index after IO
627+
* completion before this write reaches this file offset) and hence we
628+
* could do the wrong thing here (zero a page range incorrectly or fail
629+
* to zero) and corrupt data.
630+
*/
631+
if (page_ops && page_ops->iomap_valid) {
632+
bool iomap_valid = page_ops->iomap_valid(iter->inode,
633+
&iter->iomap);
634+
if (!iomap_valid) {
635+
iter->iomap.flags |= IOMAP_F_STALE;
636+
status = 0;
637+
goto out_unlock;
638+
}
639+
}
640+
621641
if (srcmap->type == IOMAP_INLINE)
622642
status = iomap_write_begin_inline(iter, page);
623643
else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
@@ -757,6 +777,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
757777
status = iomap_write_begin(iter, pos, bytes, &page);
758778
if (unlikely(status))
759779
break;
780+
if (iter->iomap.flags & IOMAP_F_STALE)
781+
break;
760782

761783
if (mapping_writably_mapped(iter->inode->i_mapping))
762784
flush_dcache_page(page);
@@ -1059,6 +1081,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
10591081
status = iomap_write_begin(iter, pos, bytes, &page);
10601082
if (unlikely(status))
10611083
return status;
1084+
if (iter->iomap.flags & IOMAP_F_STALE)
1085+
break;
10621086

10631087
status = iomap_write_end(iter, pos, bytes, bytes, page);
10641088
if (WARN_ON_ONCE(status == 0))
@@ -1104,6 +1128,8 @@ static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
11041128
status = iomap_write_begin(iter, pos, bytes, &page);
11051129
if (status)
11061130
return status;
1131+
if (iter->iomap.flags & IOMAP_F_STALE)
1132+
return status;
11071133

11081134
zero_user(page, offset, bytes);
11091135
mark_page_accessed(page);
@@ -1132,6 +1158,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
11321158
bytes = __iomap_zero_iter(iter, pos, length);
11331159
if (bytes < 0)
11341160
return bytes;
1161+
if (!bytes && iter->iomap.flags & IOMAP_F_STALE)
1162+
break;
11351163

11361164
pos += bytes;
11371165
length -= bytes;

fs/iomap/iter.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,28 @@
77
#include <linux/iomap.h>
88
#include "trace.h"
99

10+
/*
11+
* Advance to the next range we need to map.
12+
*
13+
* If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully
14+
* processed - it was aborted because the extent the iomap spanned may have been
15+
* changed during the operation. In this case, the iteration behaviour is to
16+
* remap the unprocessed range of the iter, and that means we may need to remap
17+
* even when we've made no progress (i.e. iter->processed = 0). Hence the
18+
* "finished iterating" case needs to distinguish between
19+
* (processed = 0) meaning we are done and (processed = 0 && stale) meaning we
20+
* need to remap the entire remaining range.
21+
*/
1022
static inline int iomap_iter_advance(struct iomap_iter *iter)
1123
{
24+
bool stale = iter->iomap.flags & IOMAP_F_STALE;
25+
1226
/* handle the previous iteration (if any) */
1327
if (iter->iomap.length) {
14-
if (iter->processed <= 0)
28+
if (iter->processed < 0)
1529
return iter->processed;
30+
if (!iter->processed && !stale)
31+
return 0;
1632
if (WARN_ON_ONCE(iter->processed > iomap_length(iter)))
1733
return -EIO;
1834
iter->pos += iter->processed;
@@ -33,6 +49,7 @@ static inline void iomap_iter_done(struct iomap_iter *iter)
3349
WARN_ON_ONCE(iter->iomap.offset > iter->pos);
3450
WARN_ON_ONCE(iter->iomap.length == 0);
3551
WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos);
52+
WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE);
3653

3754
trace_iomap_iter_dstmap(iter->inode, &iter->iomap);
3855
if (iter->srcmap.type != IOMAP_HOLE)

include/linux/iomap.h

Lines changed: 35 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -49,26 +49,35 @@ struct vm_fault;
4949
*
5050
* IOMAP_F_BUFFER_HEAD indicates that the file system requires the use of
5151
* buffer heads for this mapping.
52+
*
53+
* IOMAP_F_XATTR indicates that the iomap is for an extended attribute extent
54+
* rather than a file data extent.
5255
*/
53-
#define IOMAP_F_NEW 0x01
54-
#define IOMAP_F_DIRTY 0x02
55-
#define IOMAP_F_SHARED 0x04
56-
#define IOMAP_F_MERGED 0x08
57-
#define IOMAP_F_BUFFER_HEAD 0x10
58-
#define IOMAP_F_ZONE_APPEND 0x20
56+
#define IOMAP_F_NEW (1U << 0)
57+
#define IOMAP_F_DIRTY (1U << 1)
58+
#define IOMAP_F_SHARED (1U << 2)
59+
#define IOMAP_F_MERGED (1U << 3)
60+
#define IOMAP_F_BUFFER_HEAD (1U << 4)
61+
#define IOMAP_F_ZONE_APPEND (1U << 5)
62+
#define IOMAP_F_XATTR (1U << 6)
5963

6064
/*
6165
* Flags set by the core iomap code during operations:
6266
*
6367
* IOMAP_F_SIZE_CHANGED indicates to the iomap_end method that the file size
6468
* has changed as the result of this write operation.
69+
*
70+
* IOMAP_F_STALE indicates that the iomap is not valid any longer and the file
71+
* range it covers needs to be remapped by the high level before the operation
72+
* can proceed.
6573
*/
66-
#define IOMAP_F_SIZE_CHANGED 0x100
74+
#define IOMAP_F_SIZE_CHANGED (1U << 8)
75+
#define IOMAP_F_STALE (1U << 9)
6776

6877
/*
6978
* Flags from 0x1000 up are for file system specific usage:
7079
*/
71-
#define IOMAP_F_PRIVATE 0x1000
80+
#define IOMAP_F_PRIVATE (1U << 12)
7281

7382

7483
/*
@@ -89,6 +98,7 @@ struct iomap {
8998
void *inline_data;
9099
void *private; /* filesystem private */
91100
const struct iomap_page_ops *page_ops;
101+
u64 validity_cookie; /* used with .iomap_valid() */
92102
};
93103

94104
static inline sector_t iomap_sector(const struct iomap *iomap, loff_t pos)
@@ -128,6 +138,23 @@ struct iomap_page_ops {
128138
int (*page_prepare)(struct inode *inode, loff_t pos, unsigned len);
129139
void (*page_done)(struct inode *inode, loff_t pos, unsigned copied,
130140
struct page *page);
141+
142+
/*
143+
* Check that the cached iomap still maps correctly to the filesystem's
144+
* internal extent map. FS internal extent maps can change while iomap
145+
* is iterating a cached iomap, so this hook allows iomap to detect that
146+
* the iomap needs to be refreshed during a long running write
147+
* operation.
148+
*
149+
* The filesystem can store internal state (e.g. a sequence number) in
150+
* iomap->validity_cookie when the iomap is first mapped to be able to
151+
* detect changes between mapping time and whenever .iomap_valid() is
152+
* called.
153+
*
154+
* This is called with the folio over the specified file position held
155+
* locked by the iomap code.
156+
*/
157+
bool (*iomap_valid)(struct inode *inode, const struct iomap *iomap);
131158
};
132159

133160
/*

0 commit comments

Comments
 (0)