Skip to content

Commit 98077f7

Browse files
adam900710kdave
authored andcommitted
btrfs: enable experimental bs > ps support
With all the preparation patches in place, we're finally able to enable support for btrfs block size (sector size) larger than page size and give it a full fstests run.

Obviously this new feature is hidden behind experimental flags, and should not be considered a core feature yet, as btrfs' default block size is still 4K.

But this is still a feature that will shine in the future, when devices with a 16K block size are widely adopted.

For now some features are explicitly disabled:

- Direct IO
  This is the most complex part to support. The root reason is that we can not control the pages of the iov iter passed in. User space programs can only ensure the virtual addresses are contiguous, but have no control over their physical addresses.
  Our bs > ps support heavily relies on large folios, and direct IO memory can easily break that.
  So direct IO is disabled and will always fall back to buffered IO.

- RAID56
  In theory we can convert RAID56 to use large folios, but it would need to be converted back to page based if we want to support direct IO in the future.
  So just reject it for now.

- Encoded send
- Encoded read
  Both utilize btrfs_encoded_read_regular_fill_pages(), and send utilizes vmallocated memory.
  Unfortunately, for vmallocated memory we can not guarantee the minimal folio order.
  Send will simply always fall back to regular writes, which read from the page cache and follow the existing folio order requirement.

- Encoded write
  Encoded write allocates its pages by itself, and we could easily change it to follow the minimal order.
  But since encoded read is already disabled, there is no point in enabling only encoded write.

Finally, just like what we did for bs < ps support in the past, add a warning message for bs > ps mounts.

Signed-off-by: Qu Wenruo <wqu@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
1 parent e9bed72 commit 98077f7

File tree

5 files changed

+58
-15
lines changed

5 files changed

+58
-15
lines changed

fs/btrfs/direct-io.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,18 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
786786
if (iov_iter_alignment(iter) & blocksize_mask)
787787
return -EINVAL;
788788

789+
/*
790+
* For bs > ps support, we heavily rely on large folios to make sure no
791+
* block will cross large folio boundaries.
792+
*
793+
* But memory provided by direct IO is only virtually contiguous, not
794+
* physically contiguous, and will break the btrfs' large folio requirement.
795+
*
796+
* So for bs > ps support, all direct IOs should fallback to buffered ones.
797+
*/
798+
if (fs_info->sectorsize > PAGE_SIZE)
799+
return -EINVAL;
800+
789801
return 0;
790802
}
791803

fs/btrfs/disk-io.c

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3242,18 +3242,24 @@ int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
32423242
}
32433243

32443244
/*
3245-
* Subpage runtime limitation on v1 cache.
3245+
* Subpage/bs > ps runtime limitation on v1 cache.
32463246
*
32473247
* V1 space cache still has some hard coded PAGE_SIZE usage, while
32483248
* we're already defaulting to v2 cache, no need to bother v1 as it's
32493249
* going to be deprecated anyway.
32503250
*/
3251-
if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
3251+
if (fs_info->sectorsize != PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
32523252
btrfs_warn(fs_info,
32533253
"v1 space cache is not supported for page size %lu with sectorsize %u",
32543254
PAGE_SIZE, fs_info->sectorsize);
32553255
return -EINVAL;
32563256
}
3257+
if (fs_info->sectorsize > PAGE_SIZE && btrfs_fs_incompat(fs_info, RAID56)) {
3258+
btrfs_err(fs_info,
3259+
"RAID56 is not supported for page size %lu with sectorsize %u",
3260+
PAGE_SIZE, fs_info->sectorsize);
3261+
return -EINVAL;
3262+
}
32573263

32583264
/* This can be called by remount, we need to protect the super block. */
32593265
spin_lock(&fs_info->super_lock);
@@ -3388,6 +3394,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
33883394
fs_info->stripesize = stripesize;
33893395
fs_info->fs_devices->fs_info = fs_info;
33903396

3397+
if (fs_info->sectorsize > PAGE_SIZE)
3398+
btrfs_warn(fs_info,
3399+
"support for block size %u with page size %zu is experimental, some features may be missing",
3400+
fs_info->sectorsize, PAGE_SIZE);
33913401
/*
33923402
* Handle the space caching options appropriately now that we have the
33933403
* super block loaded and validated.

fs/btrfs/fs.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,7 @@ bool __attribute_const__ btrfs_supported_blocksize(u32 blocksize)
9797
*/
9898
if (IS_ENABLED(CONFIG_HIGHMEM) && blocksize > PAGE_SIZE)
9999
return false;
100-
if (blocksize <= PAGE_SIZE)
101-
return true;
100+
return true;
102101
#endif
103102
return false;
104103
}

fs/btrfs/ioctl.c

Lines changed: 25 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -4418,6 +4418,10 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
44184418
goto out_acct;
44194419
}
44204420

4421+
if (fs_info->sectorsize > PAGE_SIZE) {
4422+
ret = -ENOTTY;
4423+
goto out_acct;
4424+
}
44214425
if (compat) {
44224426
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
44234427
struct btrfs_ioctl_encoded_io_args_32 args32;
@@ -4509,6 +4513,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
45094513

45104514
static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
45114515
{
4516+
struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
45124517
struct btrfs_ioctl_encoded_io_args args;
45134518
struct iovec iovstack[UIO_FASTIOV];
45144519
struct iovec *iov = iovstack;
@@ -4522,6 +4527,11 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
45224527
goto out_acct;
45234528
}
45244529

4530+
if (fs_info->sectorsize > PAGE_SIZE) {
4531+
ret = -ENOTTY;
4532+
goto out_acct;
4533+
}
4534+
45254535
if (!(file->f_mode & FMODE_WRITE)) {
45264536
ret = -EBADF;
45274537
goto out_acct;
@@ -4780,14 +4790,14 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
47804790

47814791
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
47824792
{
4793+
struct file *file = cmd->file;
4794+
struct btrfs_inode *inode = BTRFS_I(file->f_inode);
4795+
struct extent_io_tree *io_tree = &inode->io_tree;
4796+
struct btrfs_fs_info *fs_info = inode->root->fs_info;
47834797
size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
47844798
size_t copy_end;
47854799
int ret;
47864800
u64 disk_bytenr, disk_io_size;
4787-
struct file *file;
4788-
struct btrfs_inode *inode;
4789-
struct btrfs_fs_info *fs_info;
4790-
struct extent_io_tree *io_tree;
47914801
loff_t pos;
47924802
struct kiocb kiocb;
47934803
struct extent_state *cached_state = NULL;
@@ -4803,10 +4813,11 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
48034813
ret = -EPERM;
48044814
goto out_acct;
48054815
}
4806-
file = cmd->file;
4807-
inode = BTRFS_I(file->f_inode);
4808-
fs_info = inode->root->fs_info;
4809-
io_tree = &inode->io_tree;
4816+
if (fs_info->sectorsize > PAGE_SIZE) {
4817+
ret = -ENOTTY;
4818+
goto out_acct;
4819+
}
4820+
48104821
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
48114822

48124823
if (issue_flags & IO_URING_F_COMPAT) {
@@ -4933,9 +4944,10 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue
49334944

49344945
static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
49354946
{
4947+
struct file *file = cmd->file;
4948+
struct btrfs_fs_info *fs_info = inode_to_fs_info(file->f_inode);
49364949
loff_t pos;
49374950
struct kiocb kiocb;
4938-
struct file *file;
49394951
ssize_t ret;
49404952
void __user *sqe_addr;
49414953
struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
@@ -4948,8 +4960,11 @@ static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issu
49484960
ret = -EPERM;
49494961
goto out_acct;
49504962
}
4963+
if (fs_info->sectorsize > PAGE_SIZE) {
4964+
ret = -ENOTTY;
4965+
goto out_acct;
4966+
}
49514967

4952-
file = cmd->file;
49534968
sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
49544969

49554970
if (!(file->f_mode & FMODE_WRITE)) {

fs/btrfs/send.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5654,7 +5654,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
56545654

56555655
ei = btrfs_item_ptr(leaf, path->slots[0],
56565656
struct btrfs_file_extent_item);
5657-
if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
5657+
/*
5658+
* Do not go through encoded read for bs > ps cases.
5659+
*
5660+
* Encoded send is using vmallocated pages as buffer, which we can
5661+
* not ensure every folio is large enough to contain a block.
5662+
*/
5663+
if (sctx->send_root->fs_info->sectorsize <= PAGE_SIZE &&
5664+
(sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
56585665
btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
56595666
bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
56605667
BTRFS_FILE_EXTENT_INLINE);

0 commit comments

Comments
 (0)