Skip to content

Commit f59930c

Browse files
committed
Merge: block: fix zoned split
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-10/-/merge_requests/1464 block: fix zoned split JIRA: https://issues.redhat.com/browse/RHEL-97177 Signed-off-by: Ming Lei <ming.lei@redhat.com> Approved-by: Jeff Moyer <jmoyer@redhat.com> Approved-by: Ewan D. Milne <emilne@redhat.com> Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com> Merged-by: Scott Weaver <scweaver@redhat.com>
2 parents a629960 + 4e4320a commit f59930c

File tree

6 files changed

+140
-50
lines changed

6 files changed

+140
-50
lines changed

block/blk-mq.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3169,8 +3169,10 @@ void blk_mq_submit_bio(struct bio *bio)
31693169
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
31703170
goto queue_exit;
31713171

3172-
if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
3173-
goto queue_exit;
3172+
if (bio_needs_zone_write_plugging(bio)) {
3173+
if (blk_zone_plug_bio(bio, nr_segs))
3174+
goto queue_exit;
3175+
}
31743176

31753177
new_request:
31763178
if (rq) {

block/blk-zoned.c

Lines changed: 1 addition & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1113,25 +1113,7 @@ bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
11131113
{
11141114
struct block_device *bdev = bio->bi_bdev;
11151115

1116-
if (!bdev->bd_disk->zone_wplugs_hash)
1117-
return false;
1118-
1119-
/*
1120-
* If the BIO already has the plugging flag set, then it was already
1121-
* handled through this path and this is a submission from the zone
1122-
* plug bio submit work.
1123-
*/
1124-
if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
1125-
return false;
1126-
1127-
/*
1128-
* We do not need to do anything special for empty flush BIOs, e.g
1129-
 * BIOs such as issued by blkdev_issue_flush(). This is because it is
1130-
* the responsibility of the user to first wait for the completion of
1131-
* write operations for flush to have any effect on the persistence of
1132-
* the written data.
1133-
*/
1134-
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
1116+
if (WARN_ON_ONCE(!bdev->bd_disk->zone_wplugs_hash))
11351117
return false;
11361118

11371119
/*

drivers/md/dm-crypt.c

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -253,17 +253,35 @@ MODULE_PARM_DESC(max_read_size, "Maximum size of a read request");
253253
static unsigned int max_write_size = 0;
254254
module_param(max_write_size, uint, 0644);
255255
MODULE_PARM_DESC(max_write_size, "Maximum size of a write request");
256-
static unsigned get_max_request_size(struct crypt_config *cc, bool wrt)
256+
257+
static unsigned get_max_request_sectors(struct dm_target *ti, struct bio *bio)
257258
{
259+
struct crypt_config *cc = ti->private;
258260
unsigned val, sector_align;
259-
val = !wrt ? READ_ONCE(max_read_size) : READ_ONCE(max_write_size);
260-
if (likely(!val))
261-
val = !wrt ? DM_CRYPT_DEFAULT_MAX_READ_SIZE : DM_CRYPT_DEFAULT_MAX_WRITE_SIZE;
262-
if (wrt || cc->used_tag_size) {
263-
if (unlikely(val > BIO_MAX_VECS << PAGE_SHIFT))
264-
val = BIO_MAX_VECS << PAGE_SHIFT;
265-
}
266-
sector_align = max(bdev_logical_block_size(cc->dev->bdev), (unsigned)cc->sector_size);
261+
bool wrt = op_is_write(bio_op(bio));
262+
263+
if (wrt) {
264+
/*
265+
* For zoned devices, splitting write operations creates the
266+
* risk of deadlocking queue freeze operations with zone write
267+
 * plugging BIO work when the remainder of a split BIO is
268+
* issued. So always allow the entire BIO to proceed.
269+
*/
270+
if (ti->emulate_zone_append)
271+
return bio_sectors(bio);
272+
273+
val = min_not_zero(READ_ONCE(max_write_size),
274+
DM_CRYPT_DEFAULT_MAX_WRITE_SIZE);
275+
} else {
276+
val = min_not_zero(READ_ONCE(max_read_size),
277+
DM_CRYPT_DEFAULT_MAX_READ_SIZE);
278+
}
279+
280+
if (wrt || cc->used_tag_size)
281+
val = min(val, BIO_MAX_VECS << PAGE_SHIFT);
282+
283+
sector_align = max(bdev_logical_block_size(cc->dev->bdev),
284+
(unsigned)cc->sector_size);
267285
val = round_down(val, sector_align);
268286
if (unlikely(!val))
269287
val = sector_align;
@@ -3514,7 +3532,7 @@ static int crypt_map(struct dm_target *ti, struct bio *bio)
35143532
/*
35153533
* Check if bio is too large, split as needed.
35163534
*/
3517-
max_sectors = get_max_request_size(cc, bio_data_dir(bio) == WRITE);
3535+
max_sectors = get_max_request_sectors(ti, bio);
35183536
if (unlikely(bio_sectors(bio) > max_sectors))
35193537
dm_accept_partial_bio(bio, max_sectors);
35203538

@@ -3751,6 +3769,17 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
37513769
max_t(unsigned int, limits->physical_block_size, cc->sector_size);
37523770
limits->io_min = max_t(unsigned int, limits->io_min, cc->sector_size);
37533771
limits->dma_alignment = limits->logical_block_size - 1;
3772+
3773+
/*
3774+
* For zoned dm-crypt targets, there will be no internal splitting of
3775+
* write BIOs to avoid exceeding BIO_MAX_VECS vectors per BIO. But
3776+
* without respecting this limit, crypt_alloc_buffer() will trigger a
3777+
* BUG(). Avoid this by forcing DM core to split write BIOs to this
3778+
* limit.
3779+
*/
3780+
if (ti->emulate_zone_append)
3781+
limits->max_hw_sectors = min(limits->max_hw_sectors,
3782+
BIO_MAX_VECS << PAGE_SECTORS_SHIFT);
37543783
}
37553784

37563785
static struct target_type crypt_target = {

drivers/md/dm.c

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1309,8 +1309,9 @@ static size_t dm_dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
13091309
/*
13101310
* A target may call dm_accept_partial_bio only from the map routine. It is
13111311
* allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1312-
* operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
1313-
* __send_duplicate_bios().
1312+
* operations, zone append writes (native with REQ_OP_ZONE_APPEND or emulated
1313+
* with write BIOs flagged with BIO_EMULATES_ZONE_APPEND) and any bio serviced
1314+
* by __send_duplicate_bios().
13141315
*
13151316
* dm_accept_partial_bio informs the dm that the target only wants to process
13161317
* additional n_sectors sectors of the bio and the rest of the data should be
@@ -1343,11 +1344,19 @@ void dm_accept_partial_bio(struct bio *bio, unsigned int n_sectors)
13431344
unsigned int bio_sectors = bio_sectors(bio);
13441345

13451346
BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
1346-
BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1347-
BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
13481347
BUG_ON(bio_sectors > *tio->len_ptr);
13491348
BUG_ON(n_sectors > bio_sectors);
13501349

1350+
if (static_branch_unlikely(&zoned_enabled) &&
1351+
unlikely(bdev_is_zoned(bio->bi_bdev))) {
1352+
enum req_op op = bio_op(bio);
1353+
1354+
BUG_ON(op_is_zone_mgmt(op));
1355+
BUG_ON(op == REQ_OP_WRITE);
1356+
BUG_ON(op == REQ_OP_WRITE_ZEROES);
1357+
BUG_ON(op == REQ_OP_ZONE_APPEND);
1358+
}
1359+
13511360
*tio->len_ptr -= bio_sectors - n_sectors;
13521361
bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
13531362

@@ -1792,19 +1801,35 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io,
17921801
}
17931802

17941803
#ifdef CONFIG_BLK_DEV_ZONED
1795-
static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
1796-
struct bio *bio)
1804+
static inline bool dm_zone_bio_needs_split(struct bio *bio)
17971805
{
17981806
/*
1799-
* For mapped device that need zone append emulation, we must
1800-
* split any large BIO that straddles zone boundaries.
1807+
* Special case the zone operations that cannot or should not be split.
18011808
*/
1802-
return dm_emulate_zone_append(md) && bio_straddles_zones(bio) &&
1803-
!bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
1809+
switch (bio_op(bio)) {
1810+
case REQ_OP_ZONE_APPEND:
1811+
case REQ_OP_ZONE_FINISH:
1812+
case REQ_OP_ZONE_RESET:
1813+
case REQ_OP_ZONE_RESET_ALL:
1814+
return false;
1815+
default:
1816+
break;
1817+
}
1818+
1819+
/*
1820+
* When mapped devices use the block layer zone write plugging, we must
1821+
* split any large BIO to the mapped device limits to not submit BIOs
1822+
* that span zone boundaries and to avoid potential deadlocks with
1823+
* queue freeze operations.
1824+
*/
1825+
return bio_needs_zone_write_plugging(bio) || bio_straddles_zones(bio);
18041826
}
1827+
18051828
static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio)
18061829
{
1807-
return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0);
1830+
if (!bio_needs_zone_write_plugging(bio))
1831+
return false;
1832+
return blk_zone_plug_bio(bio, 0);
18081833
}
18091834

18101835
static blk_status_t __send_zone_reset_all_emulated(struct clone_info *ci,
@@ -1920,8 +1945,7 @@ static blk_status_t __send_zone_reset_all(struct clone_info *ci)
19201945
}
19211946

19221947
#else
1923-
static inline bool dm_zone_bio_needs_split(struct mapped_device *md,
1924-
struct bio *bio)
1948+
static inline bool dm_zone_bio_needs_split(struct bio *bio)
19251949
{
19261950
return false;
19271951
}
@@ -1948,9 +1972,7 @@ static void dm_split_and_process_bio(struct mapped_device *md,
19481972

19491973
is_abnormal = is_abnormal_io(bio);
19501974
if (static_branch_unlikely(&zoned_enabled)) {
1951-
/* Special case REQ_OP_ZONE_RESET_ALL as it cannot be split. */
1952-
need_split = (bio_op(bio) != REQ_OP_ZONE_RESET_ALL) &&
1953-
(is_abnormal || dm_zone_bio_needs_split(md, bio));
1975+
need_split = is_abnormal || dm_zone_bio_needs_split(bio);
19541976
} else {
19551977
need_split = is_abnormal;
19561978
}

include/linux/blk_types.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -350,11 +350,11 @@ enum req_op {
350350
/* Close a zone */
351351
REQ_OP_ZONE_CLOSE = (__force blk_opf_t)11,
352352
/* Transition a zone to full */
353-
REQ_OP_ZONE_FINISH = (__force blk_opf_t)12,
353+
REQ_OP_ZONE_FINISH = (__force blk_opf_t)13,
354354
/* reset a zone write pointer */
355-
REQ_OP_ZONE_RESET = (__force blk_opf_t)13,
355+
REQ_OP_ZONE_RESET = (__force blk_opf_t)15,
356356
/* reset all the zone present on the device */
357-
REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15,
357+
REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)17,
358358

359359
/* Driver private requests */
360360
REQ_OP_DRV_IN = (__force blk_opf_t)34,

include/linux/blkdev.h

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,12 +719,67 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk)
719719
{
720720
return disk->nr_zones;
721721
}
722+
723+
/**
724+
* bio_needs_zone_write_plugging - Check if a BIO needs to be handled with zone
725+
* write plugging
726+
* @bio: The BIO being submitted
727+
*
728+
* Return true whenever @bio execution needs to be handled through zone
729+
* write plugging (using blk_zone_plug_bio()). Return false otherwise.
730+
*/
731+
static inline bool bio_needs_zone_write_plugging(struct bio *bio)
732+
{
733+
enum req_op op = bio_op(bio);
734+
735+
/*
736+
* Only zoned block devices have a zone write plug hash table. But not
737+
* all of them have one (e.g. DM devices may not need one).
738+
*/
739+
if (!bio->bi_bdev->bd_disk->zone_wplugs_hash)
740+
return false;
741+
742+
/* Only write operations need zone write plugging. */
743+
if (!op_is_write(op))
744+
return false;
745+
746+
/* Ignore empty flush */
747+
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
748+
return false;
749+
750+
/* Ignore BIOs that already have been handled by zone write plugging. */
751+
if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING))
752+
return false;
753+
754+
/*
755+
* All zone write operations must be handled through zone write plugging
756+
* using blk_zone_plug_bio().
757+
*/
758+
switch (op) {
759+
case REQ_OP_ZONE_APPEND:
760+
case REQ_OP_WRITE:
761+
case REQ_OP_WRITE_ZEROES:
762+
case REQ_OP_ZONE_FINISH:
763+
case REQ_OP_ZONE_RESET:
764+
case REQ_OP_ZONE_RESET_ALL:
765+
return true;
766+
default:
767+
return false;
768+
}
769+
}
770+
722771
bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs);
723772
#else /* CONFIG_BLK_DEV_ZONED */
724773
static inline unsigned int disk_nr_zones(struct gendisk *disk)
725774
{
726775
return 0;
727776
}
777+
778+
static inline bool bio_needs_zone_write_plugging(struct bio *bio)
779+
{
780+
return false;
781+
}
782+
728783
static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs)
729784
{
730785
return false;

0 commit comments

Comments
 (0)