Skip to content

Commit 946dcb6

Browse files
author
CKI KWF Bot
committed
Merge: ext4: partial zero eof block on unaligned inode size extension
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/7383 JIRA: https://issues.redhat.com/browse/RHEL-109217 Perform partial EOF block/page zeroing correctly in ext4 to prevent stale data exposure. Tested via fstests generic/363. Signed-off-by: Brian Foster <bfoster@redhat.com> Approved-by: Audra Mitchell <aubaker@redhat.com> Approved-by: Rafael Aquini <raquini@redhat.com> Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com> Merged-by: CKI GitLab Kmaint Pipeline Bot <26919896-cki-kmaint-pipeline-bot@users.noreply.gitlab.com>
2 parents 071b3c9 + 59d1118 commit 946dcb6

File tree

3 files changed

+124
-50
lines changed

3 files changed

+124
-50
lines changed

fs/ext4/extents.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4483,7 +4483,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
44834483
int depth = 0;
44844484
struct ext4_map_blocks map;
44854485
unsigned int credits;
4486-
loff_t epos;
4486+
loff_t epos, old_size = i_size_read(inode);
44874487

44884488
BUG_ON(!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS));
44894489
map.m_lblk = offset;
@@ -4541,6 +4541,11 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
45414541
epos = new_size;
45424542
if (ext4_update_inode_size(inode, epos) & 0x1)
45434543
inode->i_mtime = inode->i_ctime;
4544+
if (epos > old_size) {
4545+
pagecache_isize_extended(inode, old_size, epos);
4546+
ext4_zero_partial_blocks(handle, inode,
4547+
old_size, epos - old_size);
4548+
}
45444549
}
45454550
ret2 = ext4_mark_inode_dirty(handle, inode);
45464551
ext4_update_inode_fsync_trans(handle, inode, 1);

fs/ext4/inode.c

Lines changed: 86 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1359,8 +1359,10 @@ static int ext4_write_end(struct file *file,
13591359
unlock_page(page);
13601360
put_page(page);
13611361

1362-
if (old_size < pos && !verity)
1362+
if (old_size < pos && !verity) {
13631363
pagecache_isize_extended(inode, old_size, pos);
1364+
ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
1365+
}
13641366
/*
13651367
* Don't mark the inode dirty under page lock. First, it unnecessarily
13661368
* makes the holding time of page lock longer. Second, it forces lock
@@ -1473,8 +1475,10 @@ static int ext4_journalled_write_end(struct file *file,
14731475
unlock_page(page);
14741476
put_page(page);
14751477

1476-
if (old_size < pos && !verity)
1478+
if (old_size < pos && !verity) {
14771479
pagecache_isize_extended(inode, old_size, pos);
1480+
ext4_zero_partial_blocks(handle, inode, old_size, pos - old_size);
1481+
}
14781482

14791483
if (size_changed) {
14801484
ret2 = ext4_mark_inode_dirty(handle, inode);
@@ -3147,14 +3151,77 @@ static int ext4_da_should_update_i_disksize(struct page *page,
31473151
return 1;
31483152
}
31493153

3154+
static int ext4_da_do_write_end(struct address_space *mapping,
3155+
loff_t pos, unsigned len, unsigned copied,
3156+
struct page *page)
3157+
{
3158+
struct inode *inode = mapping->host;
3159+
loff_t old_size = inode->i_size;
3160+
bool disksize_changed = false;
3161+
loff_t new_i_size, zero_len = 0;
3162+
handle_t *handle;
3163+
3164+
/*
3165+
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
3166+
* flag, which is all that's needed to trigger page writeback.
3167+
*/
3168+
copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);
3169+
new_i_size = pos + copied;
3170+
3171+
/*
3172+
* It's important to update i_size while still holding page lock,
3173+
* because page writeout could otherwise come in and zero beyond
3174+
* i_size.
3175+
*
3176+
* Since we are holding inode lock, we are sure i_disksize <=
3177+
* i_size. We also know that if i_disksize < i_size, there are
3178+
* delalloc writes pending in the range up to i_size. If the end of
3179+
* the current write is <= i_size, there's no need to touch
3180+
* i_disksize since writeback will push i_disksize up to i_size
3181+
* eventually. If the end of the current write is > i_size and
3182+
* inside an allocated block which ext4_da_should_update_i_disksize()
3183+
* checked, we need to update i_disksize here as certain
3184+
* ext4_writepages() paths not allocating blocks do not update i_disksize.
3185+
*/
3186+
if (new_i_size > inode->i_size) {
3187+
unsigned long end;
3188+
3189+
i_size_write(inode, new_i_size);
3190+
end = (new_i_size - 1) & (PAGE_SIZE - 1);
3191+
if (copied && ext4_da_should_update_i_disksize(page, end)) {
3192+
ext4_update_i_disksize(inode, new_i_size);
3193+
disksize_changed = true;
3194+
}
3195+
}
3196+
3197+
unlock_page(page);
3198+
put_page(page);
3199+
3200+
if (pos > old_size) {
3201+
pagecache_isize_extended(inode, old_size, pos);
3202+
zero_len = pos - old_size;
3203+
}
3204+
3205+
if (!disksize_changed && !zero_len)
3206+
return copied;
3207+
3208+
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
3209+
if (IS_ERR(handle))
3210+
return PTR_ERR(handle);
3211+
if (zero_len)
3212+
ext4_zero_partial_blocks(handle, inode, old_size, zero_len);
3213+
ext4_mark_inode_dirty(handle, inode);
3214+
ext4_journal_stop(handle);
3215+
3216+
return copied;
3217+
}
3218+
31503219
static int ext4_da_write_end(struct file *file,
31513220
struct address_space *mapping,
31523221
loff_t pos, unsigned len, unsigned copied,
31533222
struct page *page, void *fsdata)
31543223
{
31553224
struct inode *inode = mapping->host;
3156-
loff_t new_i_size;
3157-
unsigned long start, end;
31583225
int write_mode = (int)(unsigned long)fsdata;
31593226

31603227
if (write_mode == FALL_BACK_TO_NONDELALLOC)
@@ -3177,30 +3244,7 @@ static int ext4_da_write_end(struct file *file,
31773244
return -EIO;
31783245
}
31793246

3180-
start = pos & (PAGE_SIZE - 1);
3181-
end = start + copied - 1;
3182-
3183-
/*
3184-
* Since we are holding inode lock, we are sure i_disksize <=
3185-
* i_size. We also know that if i_disksize < i_size, there are
3186-
* delalloc writes pending in the range upto i_size. If the end of
3187-
* the current write is <= i_size, there's no need to touch
3188-
* i_disksize since writeback will push i_disksize upto i_size
3189-
* eventually. If the end of the current write is > i_size and
3190-
* inside an allocated block (ext4_da_should_update_i_disksize()
3191-
* check), we need to update i_disksize here as neither
3192-
* ext4_writepage() nor certain ext4_writepages() paths not
3193-
* allocating blocks update i_disksize.
3194-
*
3195-
* Note that we defer inode dirtying to generic_write_end() /
3196-
* ext4_da_write_inline_data_end().
3197-
*/
3198-
new_i_size = pos + copied;
3199-
if (copied && new_i_size > inode->i_size &&
3200-
ext4_da_should_update_i_disksize(page, end))
3201-
ext4_update_i_disksize(inode, new_i_size);
3202-
3203-
return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
3247+
return ext4_da_do_write_end(mapping, pos, len, copied, page);
32043248
}
32053249

32063250
/*
@@ -5606,6 +5650,14 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
56065650
}
56075651

56085652
if (attr->ia_size != inode->i_size) {
5653+
/* attach jbd2 jinode for EOF folio tail zeroing */
5654+
if (attr->ia_size & (inode->i_sb->s_blocksize - 1) ||
5655+
oldsize & (inode->i_sb->s_blocksize - 1)) {
5656+
error = ext4_inode_attach_jinode(inode);
5657+
if (error)
5658+
goto out_mmap_sem;
5659+
}
5660+
56095661
handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
56105662
if (IS_ERR(handle)) {
56115663
error = PTR_ERR(handle);
@@ -5616,12 +5668,16 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
56165668
orphan = 1;
56175669
}
56185670
/*
5619-
* Update c/mtime on truncate up, ext4_truncate() will
5620-
* update c/mtime in shrink case below
5671+
* Update c/mtime and tail zero the EOF folio on
5672+
* truncate up. ext4_truncate() handles the shrink case
5673+
* below.
56215674
*/
56225675
if (!shrink) {
56235676
inode->i_mtime = current_time(inode);
56245677
inode->i_ctime = inode->i_mtime;
5678+
if (oldsize & (inode->i_sb->s_blocksize - 1))
5679+
ext4_block_truncate_page(handle,
5680+
inode->i_mapping, oldsize);
56255681
}
56265682

56275683
if (shrink)

mm/truncate.c

Lines changed: 32 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -766,15 +766,15 @@ EXPORT_SYMBOL(truncate_setsize);
766766
* @from: original inode size
767767
* @to: new inode size
768768
*
769-
* Handle extension of inode size either caused by extending truncate or by
770-
* write starting after current i_size. We mark the page straddling current
771-
* i_size RO so that page_mkwrite() is called on the nearest write access to
772-
* the page. This way filesystem can be sure that page_mkwrite() is called on
773-
* the page before user writes to the page via mmap after the i_size has been
774-
* changed.
769+
* Handle extension of inode size either caused by extending truncate or
770+
* by write starting after current i_size. We mark the page straddling
771+
* current i_size RO so that page_mkwrite() is called on the first
772+
* write access to the page. The filesystem will update its per-block
773+
* information before user writes to the page via mmap after the i_size
774+
* has been changed.
775775
*
776776
* The function must be called after i_size is updated so that page fault
777-
* coming after we unlock the page will already see the new i_size.
777+
* coming after we unlock the folio will already see the new i_size.
778778
* The function must be called while we still hold i_rwsem - this not only
779779
* makes sure i_size is stable but also that userspace cannot observe new
780780
* i_size value before we are prepared to store mmap writes at new inode size.
@@ -783,31 +783,44 @@ void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to)
783783
{
784784
int bsize = i_blocksize(inode);
785785
loff_t rounded_from;
786-
struct page *page;
787-
pgoff_t index;
786+
struct folio *folio;
788787

789788
WARN_ON(to > inode->i_size);
790789

791-
if (from >= to || bsize == PAGE_SIZE)
790+
if (from >= to || bsize >= PAGE_SIZE)
792791
return;
793792
/* Page straddling @from will not have any hole block created? */
794793
rounded_from = round_up(from, bsize);
795794
if (to <= rounded_from || !(rounded_from & (PAGE_SIZE - 1)))
796795
return;
797796

798-
index = from >> PAGE_SHIFT;
799-
page = find_lock_page(inode->i_mapping, index);
800-
/* Page not cached? Nothing to do */
801-
if (!page)
797+
folio = filemap_lock_folio(inode->i_mapping, from / PAGE_SIZE);
798+
/* Folio not cached? Nothing to do */
799+
if (IS_ERR(folio))
802800
return;
803801
/*
804-
* See clear_page_dirty_for_io() for details why set_page_dirty()
802+
* See folio_clear_dirty_for_io() for details why folio_mark_dirty()
805803
* is needed.
806804
*/
807-
if (page_mkclean(page))
808-
set_page_dirty(page);
809-
unlock_page(page);
810-
put_page(page);
805+
if (folio_mkclean(folio))
806+
folio_mark_dirty(folio);
807+
808+
/*
809+
* The post-eof range of the folio must be zeroed before it is exposed
810+
* to the file. Writeback normally does this, but since i_size has been
811+
* increased we handle it here.
812+
*/
813+
if (folio_test_dirty(folio)) {
814+
unsigned int offset, end;
815+
816+
offset = from - folio_pos(folio);
817+
end = min_t(unsigned int, to - folio_pos(folio),
818+
folio_size(folio));
819+
folio_zero_segment(folio, offset, end);
820+
}
821+
822+
folio_unlock(folio);
823+
folio_put(folio);
811824
}
812825
EXPORT_SYMBOL(pagecache_isize_extended);
813826

0 commit comments

Comments
 (0)