@@ -1394,16 +1394,53 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
 }
 EXPORT_SYMBOL_GPL(iomap_file_unshare);
 
-static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
+/*
+ * Flush the remaining range of the iter and mark the current mapping stale.
+ * This is used when zero range sees an unwritten mapping that may have had
+ * dirty pagecache over it.
+ */
+static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i)
+{
+	struct address_space *mapping = i->inode->i_mapping;
+	loff_t end = i->pos + i->len - 1;
+
+	i->iomap.flags |= IOMAP_F_STALE;
+	return filemap_write_and_wait_range(mapping, i->pos, end);
+}
+
+static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero,
+		bool *range_dirty)
 {
 	const struct iomap *srcmap = iomap_iter_srcmap(iter);
 	loff_t pos = iter->pos;
 	loff_t length = iomap_length(iter);
 	loff_t written = 0;
 
-	/* already zeroed? we're done. */
-	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
+	/*
+	 * We must zero subranges of unwritten mappings that might be dirty in
+	 * pagecache from previous writes. We only know whether the entire range
+	 * was clean or not, however, and dirty folios may have been written
+	 * back or reclaimed at any point after mapping lookup.
+	 *
+	 * The easiest way to deal with this is to flush pagecache to trigger
+	 * any pending unwritten conversions and then grab the updated extents
+	 * from the fs. The flush may change the current mapping, so mark it
+	 * stale for the iterator to remap it for the next pass to handle
+	 * properly.
+	 *
+	 * Note that holes are treated the same as unwritten because zero range
+	 * is (ab)used for partial folio zeroing in some cases. Hole backed
+	 * post-eof ranges can be dirtied via mapped write and the flush
+	 * triggers writeback time post-eof zeroing.
+	 */
+	if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) {
+		if (*range_dirty) {
+			*range_dirty = false;
+			return iomap_zero_iter_flush_and_stale(iter);
+		}
+		/* range is clean and already zeroed, nothing to do */
 		return length;
+	}
 
 	do {
 		struct folio *folio;
@@ -1451,9 +1488,27 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
 		.flags		= IOMAP_ZERO,
 	};
 	int ret;
+	bool range_dirty;
+
+	/*
+	 * Zero range wants to skip pre-zeroed (i.e. unwritten) mappings, but
+	 * pagecache must be flushed to ensure stale data from previous
+	 * buffered writes is not exposed. A flush is only required for certain
+	 * types of mappings, but checking pagecache after mapping lookup is
+	 * racy with writeback and reclaim.
+	 *
+	 * Therefore, check the entire range first and pass along whether any
+	 * part of it is dirty. If so and an underlying mapping warrants it,
+	 * flush the cache at that point. This trades off the occasional false
+	 * positive (and spurious flush, if the dirty data and mapping don't
+	 * happen to overlap) for simplicity in handling a relatively uncommon
+	 * situation.
+	 */
+	range_dirty = filemap_range_needs_writeback(inode->i_mapping,
+					pos, pos + len - 1);
 
 	while ((ret = iomap_iter(&iter, ops)) > 0)
-		iter.processed = iomap_zero_iter(&iter, did_zero);
+		iter.processed = iomap_zero_iter(&iter, did_zero, &range_dirty);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_zero_range);
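
For context, a minimal caller-side sketch of how a filesystem built on iomap might invoke iomap_zero_range() after this change. The exported interface is unchanged; only the internal dirty-check and on-demand flush behaviour differs. The names my_fs_zero_range and my_fs_iomap_ops below are hypothetical placeholders and are not part of this patch.

#include <linux/fs.h>
#include <linux/iomap.h>

/* Hypothetical: the filesystem's extent mapping callbacks. */
extern const struct iomap_ops my_fs_iomap_ops;

/*
 * Hypothetical wrapper: zero a byte range of an inode, e.g. from a truncate
 * or hole-punch path. With this patch, iomap_zero_range() samples
 * filemap_range_needs_writeback() once up front and only flushes when it
 * actually hits a hole/unwritten mapping that dirty pagecache may cover.
 */
static int my_fs_zero_range(struct inode *inode, loff_t pos, loff_t len)
{
	bool did_zero = false;
	int error;

	error = iomap_zero_range(inode, pos, len, &did_zero,
				 &my_fs_iomap_ops);
	if (error)
		return error;

	/* did_zero reports whether any folios were actually zeroed. */
	return 0;
}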