@@ -590,7 +590,7 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter,
 	return iomap_read_inline_data(iter, page);
 }
 
-static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
+static int iomap_write_begin(struct iomap_iter *iter, loff_t pos,
 		unsigned len, struct page **pagep)
 {
 	const struct iomap_page_ops *page_ops = iter->iomap.page_ops;
@@ -618,6 +618,26 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
 		goto out_no_page;
 	}
 
+	/*
+	 * Now that we have a locked folio, before we do anything with it we
+	 * need to check that the iomap we have cached is not stale. The inode
+	 * extent mapping can change due to concurrent IO in flight (e.g.
+	 * IOMAP_UNWRITTEN state can change and memory reclaim could have
+	 * reclaimed a previously partially written page at this index after IO
+	 * completion before this write reaches this file offset) and hence we
+	 * could do the wrong thing here (zero a page range incorrectly or fail
+	 * to zero) and corrupt data.
+	 */
+	if (page_ops && page_ops->iomap_valid) {
+		bool iomap_valid = page_ops->iomap_valid(iter->inode,
+							&iter->iomap);
+		if (!iomap_valid) {
+			iter->iomap.flags |= IOMAP_F_STALE;
+			status = 0;
+			goto out_unlock;
+		}
+	}
+
 	if (srcmap->type == IOMAP_INLINE)
 		status = iomap_write_begin_inline(iter, page);
 	else if (srcmap->flags & IOMAP_F_BUFFER_HEAD)
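The ->iomap_valid hook consulted above is the point where, with the page now locked, the filesystem decides whether the mapping it handed out earlier can still be trusted. Purely as an illustrative sketch (none of the names below come from this patch), a filesystem could sample a per-inode extent-tree sequence counter into iomap->private from its ->iomap_begin and have ->iomap_valid compare that snapshot against the live counter:

/* Hypothetical example: example_inode() and i_extent_seq are made-up names. */
static bool example_iomap_valid(struct inode *inode, const struct iomap *iomap)
{
	/*
	 * ->iomap_begin is assumed to have stored the sequence counter value
	 * it saw while building this mapping in iomap->private. If the extent
	 * tree has changed since then, the cached mapping must not be trusted.
	 */
	return (unsigned long)iomap->private ==
			READ_ONCE(example_inode(inode)->i_extent_seq);
}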
@@ -757,6 +777,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
 		status = iomap_write_begin(iter, pos, bytes, &page);
 		if (unlikely(status))
 			break;
+		if (iter->iomap.flags & IOMAP_F_STALE)
+			break;
 
 		if (mapping_writably_mapped(iter->inode->i_mapping))
 			flush_dcache_page(page);
@@ -810,6 +832,231 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
+/*
+ * Scan the data range passed to us for dirty page cache folios. If we find a
+ * dirty folio, punch out the preceding range and update the offset from which
+ * the next punch will start.
+ *
+ * We can punch out storage reservations under clean pages because they either
+ * contain data that has been written back - in which case the delalloc punch
+ * over that range is a no-op - or they have been instantiated by read faults,
+ * in which case they contain zeroes and we can remove the delalloc backing
+ * range and any new writes to those pages will do the normal hole filling
+ * operation...
+ *
+ * This makes the logic simple: we only need to keep delalloc extents over the
+ * dirty ranges of the page cache.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ */
+static int iomap_write_delalloc_scan(struct inode *inode,
+		loff_t *punch_start_byte, loff_t start_byte, loff_t end_byte,
+		int (*punch)(struct inode *inode, loff_t offset, loff_t length))
+{
+	while (start_byte < end_byte) {
+		struct folio	*folio;
+
+		/* grab locked page */
+		folio = filemap_lock_folio(inode->i_mapping,
+				start_byte >> PAGE_SHIFT);
+		if (!folio) {
+			start_byte = ALIGN_DOWN(start_byte, PAGE_SIZE) +
+					PAGE_SIZE;
+			continue;
+		}
+
+		/* if dirty, punch up to offset */
+		if (folio_test_dirty(folio)) {
+			if (start_byte > *punch_start_byte) {
+				int	error;
+
+				error = punch(inode, *punch_start_byte,
+						start_byte - *punch_start_byte);
+				if (error) {
+					folio_unlock(folio);
+					folio_put(folio);
+					return error;
+				}
+			}
+
+			/*
+			 * Make sure the next punch start is correctly bound to
+			 * the end of this data range, not the end of the folio.
+			 */
+			*punch_start_byte = min_t(loff_t, end_byte,
+					folio_next_index(folio) << PAGE_SHIFT);
+		}
+
+		/* move offset to start of next folio in range */
+		start_byte = folio_next_index(folio) << PAGE_SHIFT;
+		folio_unlock(folio);
+		folio_put(folio);
+	}
+	return 0;
+}
+
+/*
+ * Punch out all the delalloc blocks in the range given except for those that
+ * have dirty data still pending in the page cache - those are going to be
+ * written and so must still retain the delalloc backing for writeback.
+ *
+ * As we are scanning the page cache for data, we don't need to reimplement the
+ * wheel - mapping_seek_hole_data() does exactly what we need to identify the
+ * start and end of data ranges correctly even for sub-folio block sizes. This
+ * byte range based iteration is especially convenient because it means we
+ * don't have to care about variable size folios, nor where the start or end of
+ * the data range lies within a folio, if they lie within the same folio or
+ * even if there are multiple discontiguous data ranges within the folio.
+ *
+ * It should be noted that mapping_seek_hole_data() is not aware of EOF, and so
+ * can return data ranges that exist in the cache beyond EOF. e.g. a page fault
+ * spanning EOF will initialise the post-EOF data to zeroes and mark it up to
+ * date. A write page fault can then mark it dirty. If we then fail a write()
+ * beyond EOF into that up to date cached range, we allocate a delalloc block
+ * beyond EOF and then have to punch it out. Because the range is up to date,
+ * mapping_seek_hole_data() will return it, and we will skip the punch because
+ * the folio is dirty. This is incorrect - we always need to punch out delalloc
+ * beyond EOF in this case as writeback will never write back and convert that
+ * delalloc block beyond EOF. Hence we limit the cached data scan range to EOF,
+ * resulting in always punching out the range from the EOF to the end of the
+ * range the iomap spans.
+ *
+ * Intervals are of the form [start_byte, end_byte) (i.e. open ended) because
+ * that matches the intervals returned by mapping_seek_hole_data(). i.e.
+ * SEEK_DATA returns the start of a data range (start_byte), and
+ * SEEK_HOLE(start_byte) returns the end of the data range (data_end). Using
+ * closed intervals would require sprinkling this code with magic "+ 1" and
+ * "- 1" arithmetic and expose the code to subtle off-by-one bugs.
+ */
+static int iomap_write_delalloc_release(struct inode *inode,
+		loff_t start_byte, loff_t end_byte,
+		int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+	loff_t punch_start_byte = start_byte;
+	loff_t scan_end_byte = min(i_size_read(inode), end_byte);
+	int error = 0;
+
+	/*
+	 * Lock the mapping to avoid races with page faults re-instantiating
+	 * folios and dirtying them via ->page_mkwrite whilst we walk the
+	 * cache and perform delalloc extent removal. Failing to do this can
+	 * leave dirty pages with no space reservation in the cache.
+	 */
+	filemap_invalidate_lock(inode->i_mapping);
+	while (start_byte < scan_end_byte) {
+		loff_t		data_end;
+
+		start_byte = mapping_seek_hole_data(inode->i_mapping,
+				start_byte, scan_end_byte, SEEK_DATA);
+		/*
+		 * If there is no more data to scan, all that is left is to
+		 * punch out the remaining range.
+		 */
+		if (start_byte == -ENXIO || start_byte == scan_end_byte)
+			break;
+		if (start_byte < 0) {
+			error = start_byte;
+			goto out_unlock;
+		}
+		WARN_ON_ONCE(start_byte < punch_start_byte);
+		WARN_ON_ONCE(start_byte > scan_end_byte);
+
+		/*
+		 * We find the end of this contiguous cached data range by
+		 * seeking from start_byte to the beginning of the next hole.
+		 */
+		data_end = mapping_seek_hole_data(inode->i_mapping, start_byte,
+				scan_end_byte, SEEK_HOLE);
+		if (data_end < 0) {
+			error = data_end;
+			goto out_unlock;
+		}
+		WARN_ON_ONCE(data_end <= start_byte);
+		WARN_ON_ONCE(data_end > scan_end_byte);
+
+		error = iomap_write_delalloc_scan(inode, &punch_start_byte,
+				start_byte, data_end, punch);
+		if (error)
+			goto out_unlock;
+
+		/* The next data search starts at the end of this one. */
+		start_byte = data_end;
+	}
+
+	if (punch_start_byte < end_byte)
+		error = punch(inode, punch_start_byte,
+				end_byte - punch_start_byte);
+out_unlock:
+	filemap_invalidate_unlock(inode->i_mapping);
+	return error;
+}
+
+/*
+ * When a short write occurs, the filesystem may need to remove reserved space
+ * that was allocated in ->iomap_begin from its ->iomap_end method. For
+ * filesystems that use delayed allocation, we need to punch out delalloc
+ * extents from the range that are not dirty in the page cache. As the write
+ * can race with page faults, there can be dirty pages over the delalloc
+ * extent outside the range of a short write but still within the delalloc
+ * extent allocated for this iomap.
+ *
+ * This function uses [start_byte, end_byte) intervals (i.e. open ended) to
+ * simplify range iterations.
+ *
+ * The punch() callback *must* only punch delalloc extents in the range passed
+ * to it. It must skip over all other types of extents in the range and leave
+ * them completely unchanged. It must do this punch atomically with respect to
+ * other extent modifications.
+ *
+ * The punch() callback may be called with a folio locked to prevent writeback
+ * extent allocation racing at the edge of the range we are currently
+ * punching. The locked folio may or may not cover the range being punched, so
+ * it is not safe for the punch() callback to lock folios itself.
+ *
+ * Lock order is:
+ *
+ * inode->i_rwsem (shared or exclusive)
+ *   inode->i_mapping->invalidate_lock (exclusive)
+ *     folio_lock()
+ *       ->punch
+ *         internal filesystem allocation lock
+ */
+int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
+		struct iomap *iomap, loff_t pos, loff_t length,
+		ssize_t written,
+		int (*punch)(struct inode *inode, loff_t pos, loff_t length))
+{
+	loff_t start_byte;
+	loff_t end_byte;
+	int blocksize = i_blocksize(inode);
+
+	if (iomap->type != IOMAP_DELALLOC)
+		return 0;
+
+	/* If we didn't reserve the blocks, we're not allowed to punch them. */
+	if (!(iomap->flags & IOMAP_F_NEW))
+		return 0;
+
+	/*
+	 * start_byte refers to the first unused block after a short write. If
+	 * nothing was written, round offset down to point at the first block
+	 * in the range.
+	 */
+	if (unlikely(!written))
+		start_byte = round_down(pos, blocksize);
+	else
+		start_byte = round_up(pos + written, blocksize);
+	end_byte = round_up(pos + length, blocksize);
+
+	/* Nothing to do if we've written the entire delalloc extent */
+	if (start_byte >= end_byte)
+		return 0;
+
+	return iomap_write_delalloc_release(inode, start_byte, end_byte,
+					punch);
+}
+EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc);
+
 static loff_t iomap_unshare_iter(struct iomap_iter *iter)
 {
 	struct iomap *iomap = &iter->iomap;
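For context on how the new export is meant to be consumed: iomap_file_buffered_write_punch_delalloc() added above is intended to be called by the filesystem from its ->iomap_end handler when a buffered write comes up short. A rough, hypothetical sketch of such a caller (the example_* names are illustrative and not taken from any real filesystem) might look like:

/* Hypothetical sketch of a delalloc filesystem wiring up the punch callback. */
static int example_punch_delalloc(struct inode *inode, loff_t offset,
		loff_t length)
{
	/* Must remove only delalloc extents and leave everything else alone. */
	return example_remove_delalloc_range(inode, offset, offset + length);
}

static int example_buffered_write_iomap_end(struct inode *inode, loff_t pos,
		loff_t length, ssize_t written, unsigned flags,
		struct iomap *iomap)
{
	/* On a short or failed write, release the unused delalloc reservation. */
	return iomap_file_buffered_write_punch_delalloc(inode, iomap, pos,
			length, written, example_punch_delalloc);
}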
@@ -834,6 +1081,8 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter)
 		status = iomap_write_begin(iter, pos, bytes, &page);
 		if (unlikely(status))
 			return status;
+		if (iter->iomap.flags & IOMAP_F_STALE)
+			break;
 
 		status = iomap_write_end(iter, pos, bytes, bytes, page);
 		if (WARN_ON_ONCE(status == 0))
@@ -879,6 +1128,8 @@ static s64 __iomap_zero_iter(struct iomap_iter *iter, loff_t pos, u64 length)
 	status = iomap_write_begin(iter, pos, bytes, &page);
 	if (status)
 		return status;
+	if (iter->iomap.flags & IOMAP_F_STALE)
+		return status;
 
 	zero_user(page, offset, bytes);
 	mark_page_accessed(page);
@@ -907,6 +1158,8 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero)
 		bytes = __iomap_zero_iter(iter, pos, length);
 		if (bytes < 0)
 			return bytes;
+		if (!bytes && iter->iomap.flags & IOMAP_F_STALE)
+			break;
 
 		pos += bytes;
 		length -= bytes;
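Taken together, the IOMAP_F_STALE checks added to iomap_write_iter(), iomap_unshare_iter() and the zeroing paths all follow the same pattern: bail out with no progress and leave the flag set, so the layer driving the iteration can ask the filesystem for a fresh mapping instead of treating the zero-length result as completion. That driving loop is not part of the hunks shown here; the sketch below only illustrates the remap-and-retry idea and is not the actual iomap_iter() change.

	/* Illustrative pseudologic only - not the real iteration code. */
	do {
		ret = ops->iomap_begin(inode, pos, length, flags,
				&iter.iomap, &iter.srcmap);
		if (ret)
			break;
		processed = iomap_write_iter(&iter, i); /* may set IOMAP_F_STALE */
		stale = iter.iomap.flags & IOMAP_F_STALE;
		if (ops->iomap_end)
			ret = ops->iomap_end(inode, pos, length, processed,
					flags, &iter.iomap);
		if (processed > 0) {
			pos += processed;
			length -= processed;
		}
		/* A stale map with no progress is retried, not treated as done. */
	} while (!ret && length && (processed > 0 || stale));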