Skip to content

Commit 5832d26

Browse files
committed
Merge tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux
Pull io_uring updates from Jens Axboe: - Store ring provided buffers locally for the users, rather than stuff them into struct io_kiocb. These types of buffers must always be fully consumed or recycled in the current context, and leaving them in struct io_kiocb is hence not a good idea as that struct has a vastly different life time. Basically just an architecture cleanup that can help prevent issues with ring provided buffers in the future. - Support for mixed CQE sizes in the same ring. Before this change, a CQ ring either used the default 16b CQEs, or it was setup with 32b CQE using IORING_SETUP_CQE32. For use cases where a few 32b CQEs were needed, this caused everything else to use big CQEs. This is wasteful both in terms of memory usage, but also memory bandwidth for the posted CQEs. With IORING_SETUP_CQE_MIXED, applications may use request types that post both normal 16b and big 32b CQEs on the same ring. - Add helpers for async data management, to make it harder for opcode handlers to mess it up. - Add support for multishot for uring_cmd, which ublk can use. This helps improve efficiency, by providing a persistent request type that can trigger multiple CQEs. - Add initial support for ring feature querying. We had basic support for probe operations, but the API isn't great. Rather than expand that, add support for QUERY which is easily expandable and can cover a lot more cases than the existing probe support. This will help applications get a better idea of what operations are supported on a given host. - zcrx improvements from Pavel: - Improve refill entry alignment for better caching - Various cleanups, especially around deduplicating normal memory vs dmabuf setup. - Generalisation of the niov size (Patch 12). It's still hard coded to PAGE_SIZE on init, but will let the user specify the rx buffer length on setup. - Syscall / synchronous buffer return. It'll be used as a slow fallback path for returning buffers when the refill queue is full. 
Useful for tolerating slight queue size misconfiguration or with inconsistent load. - Accounting more memory to cgroups. - Additional independent cleanups that will also be useful for multi-area support. - Various fixes and cleanups * tag 'for-6.18/io_uring-20250929' of git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux: (68 commits) io_uring/cmd: drop unused res2 param from io_uring_cmd_done() io_uring: fix nvme's 32b cqes on mixed cq io_uring/query: cap number of queries io_uring/query: prevent infinite loops io_uring/zcrx: account niov arrays to cgroup io_uring/zcrx: allow synchronous buffer return io_uring/zcrx: introduce io_parse_rqe() io_uring/zcrx: don't adjust free cache space io_uring/zcrx: use guards for the refill lock io_uring/zcrx: reduce netmem scope in refill io_uring/zcrx: protect netdev with pp_lock io_uring/zcrx: rename dma lock io_uring/zcrx: make niov size variable io_uring/zcrx: set sgt for umem area io_uring/zcrx: remove dmabuf_offset io_uring/zcrx: deduplicate area mapping io_uring/zcrx: pass ifq to io_zcrx_alloc_fallback() io_uring/zcrx: check all niovs filled with dma addresses io_uring/zcrx: move area reg checks into io_import_area io_uring/zcrx: don't pass slot to io_zcrx_create_area ...
2 parents 77633c7 + ef9f603 commit 5832d26

File tree

37 files changed

+1001
-452
lines changed

37 files changed

+1001
-452
lines changed

Documentation/networking/iou-zcrx.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ Create an io_uring instance with the following required setup flags::
7575

7676
IORING_SETUP_SINGLE_ISSUER
7777
IORING_SETUP_DEFER_TASKRUN
78-
IORING_SETUP_CQE32
78+
IORING_SETUP_CQE32 or IORING_SETUP_CQE_MIXED
7979

8080
Create memory area
8181
------------------

block/ioctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -776,7 +776,7 @@ static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
776776
if (bic->res == -EAGAIN && bic->nowait)
777777
io_uring_cmd_issue_blocking(cmd);
778778
else
779-
io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
779+
io_uring_cmd_done(cmd, bic->res, issue_flags);
780780
}
781781

782782
static void bio_cmd_bio_end_io(struct bio *bio)

drivers/block/ublk_drv.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,7 +1189,7 @@ static void ublk_complete_io_cmd(struct ublk_io *io, struct request *req,
11891189
struct io_uring_cmd *cmd = __ublk_prep_compl_io_cmd(io, req);
11901190

11911191
/* tell ublksrv one io request is coming */
1192-
io_uring_cmd_done(cmd, res, 0, issue_flags);
1192+
io_uring_cmd_done(cmd, res, issue_flags);
11931193
}
11941194

11951195
#define UBLK_REQUEUE_DELAY_MS 3
@@ -1873,7 +1873,7 @@ static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag,
18731873
spin_unlock(&ubq->cancel_lock);
18741874

18751875
if (!done)
1876-
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0, issue_flags);
1876+
io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, issue_flags);
18771877
}
18781878

18791879
/*
@@ -2520,7 +2520,7 @@ static void ublk_ch_uring_cmd_cb(struct io_uring_cmd *cmd,
25202520
int ret = ublk_ch_uring_cmd_local(cmd, issue_flags);
25212521

25222522
if (ret != -EIOCBQUEUED)
2523-
io_uring_cmd_done(cmd, ret, 0, issue_flags);
2523+
io_uring_cmd_done(cmd, ret, issue_flags);
25242524
}
25252525

25262526
static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)

drivers/nvme/host/ioctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ static void nvme_uring_task_cb(struct io_uring_cmd *ioucmd,
410410

411411
if (pdu->bio)
412412
blk_rq_unmap_user(pdu->bio);
413-
io_uring_cmd_done(ioucmd, pdu->status, pdu->result, issue_flags);
413+
io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, issue_flags);
414414
}
415415

416416
static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,

fs/btrfs/ioctl.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4695,7 +4695,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss
46954695
btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
46964696
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
46974697

4698-
io_uring_cmd_done(cmd, ret, 0, issue_flags);
4698+
io_uring_cmd_done(cmd, ret, issue_flags);
46994699
add_rchar(current, ret);
47004700

47014701
for (index = 0; index < priv->nr_pages; index++)

fs/fuse/dev_uring.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent)
351351
spin_unlock(&queue->lock);
352352

353353
if (cmd)
354-
io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED);
354+
io_uring_cmd_done(cmd, -ENOTCONN, IO_URING_F_UNLOCKED);
355355

356356
if (req)
357357
fuse_uring_stop_fuse_req_end(req);
@@ -518,7 +518,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd,
518518

519519
if (need_cmd_done) {
520520
/* no queue lock to avoid lock order issues */
521-
io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags);
521+
io_uring_cmd_done(cmd, -ENOTCONN, issue_flags);
522522
}
523523
}
524524

@@ -733,7 +733,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent,
733733
list_move_tail(&ent->list, &queue->ent_in_userspace);
734734
spin_unlock(&queue->lock);
735735

736-
io_uring_cmd_done(cmd, 0, 0, issue_flags);
736+
io_uring_cmd_done(cmd, 0, issue_flags);
737737
return 0;
738738
}
739739

@@ -1200,7 +1200,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd,
12001200
ent->cmd = NULL;
12011201
spin_unlock(&queue->lock);
12021202

1203-
io_uring_cmd_done(cmd, ret, 0, issue_flags);
1203+
io_uring_cmd_done(cmd, ret, issue_flags);
12041204
}
12051205

12061206
/*

include/linux/io_uring/cmd.h

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,14 @@
1111
/* io_uring_cmd is being issued again */
1212
#define IORING_URING_CMD_REISSUE (1U << 31)
1313

14+
typedef void (*io_uring_cmd_tw_t)(struct io_uring_cmd *cmd,
15+
unsigned issue_flags);
16+
1417
struct io_uring_cmd {
1518
struct file *file;
1619
const struct io_uring_sqe *sqe;
1720
/* callback to defer completions to task context */
18-
void (*task_work_cb)(struct io_uring_cmd *cmd, unsigned);
21+
io_uring_cmd_tw_t task_work_cb;
1922
u32 cmd_op;
2023
u32 flags;
2124
u8 pdu[32]; /* available inline for free use */
@@ -53,11 +56,11 @@ int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
5356
* Note: the caller should never hard code @issue_flags and is only allowed
5457
* to pass the mask provided by the core io_uring code.
5558
*/
56-
void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, u64 res2,
57-
unsigned issue_flags);
59+
void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret, u64 res2,
60+
unsigned issue_flags, bool is_cqe32);
5861

5962
void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
60-
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
63+
io_uring_cmd_tw_t task_work_cb,
6164
unsigned flags);
6265

6366
/*
@@ -70,6 +73,21 @@ void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
7073
/* Execute the request from a blocking context */
7174
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
7275

76+
/*
77+
* Select a buffer from the provided buffer group for multishot uring_cmd.
78+
* Returns the selected buffer address and size.
79+
*/
80+
struct io_br_sel io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd,
81+
unsigned buf_group, size_t *len,
82+
unsigned int issue_flags);
83+
84+
/*
85+
* Complete a multishot uring_cmd event. This will post a CQE to the completion
86+
* queue and update the provided buffer.
87+
*/
88+
bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
89+
struct io_br_sel *sel, unsigned int issue_flags);
90+
7391
#else
7492
static inline int
7593
io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
@@ -86,13 +104,12 @@ static inline int io_uring_cmd_import_fixed_vec(struct io_uring_cmd *ioucmd,
86104
{
87105
return -EOPNOTSUPP;
88106
}
89-
static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
90-
u64 ret2, unsigned issue_flags)
107+
static inline void __io_uring_cmd_done(struct io_uring_cmd *cmd, s32 ret,
108+
u64 ret2, unsigned issue_flags, bool is_cqe32)
91109
{
92110
}
93111
static inline void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
94-
void (*task_work_cb)(struct io_uring_cmd *, unsigned),
95-
unsigned flags)
112+
io_uring_cmd_tw_t task_work_cb, unsigned flags)
96113
{
97114
}
98115
static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
@@ -102,28 +119,28 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
102119
static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
103120
{
104121
}
105-
#endif
106-
107-
/*
108-
* Polled completions must ensure they are coming from a poll queue, and
109-
* hence are completed inside the usual poll handling loops.
110-
*/
111-
static inline void io_uring_cmd_iopoll_done(struct io_uring_cmd *ioucmd,
112-
ssize_t ret, ssize_t res2)
122+
static inline struct io_br_sel
123+
io_uring_cmd_buffer_select(struct io_uring_cmd *ioucmd, unsigned buf_group,
124+
size_t *len, unsigned int issue_flags)
125+
{
126+
return (struct io_br_sel) { .val = -EOPNOTSUPP };
127+
}
128+
static inline bool io_uring_mshot_cmd_post_cqe(struct io_uring_cmd *ioucmd,
129+
struct io_br_sel *sel, unsigned int issue_flags)
113130
{
114-
lockdep_assert(in_task());
115-
io_uring_cmd_done(ioucmd, ret, res2, 0);
131+
return true;
116132
}
133+
#endif
117134

118135
/* users must follow the IOU_F_TWQ_LAZY_WAKE semantics */
119136
static inline void io_uring_cmd_do_in_task_lazy(struct io_uring_cmd *ioucmd,
120-
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
137+
io_uring_cmd_tw_t task_work_cb)
121138
{
122139
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, IOU_F_TWQ_LAZY_WAKE);
123140
}
124141

125142
static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
126-
void (*task_work_cb)(struct io_uring_cmd *, unsigned))
143+
io_uring_cmd_tw_t task_work_cb)
127144
{
128145
__io_uring_cmd_do_in_task(ioucmd, task_work_cb, 0);
129146
}
@@ -142,6 +159,18 @@ static inline void *io_uring_cmd_ctx_handle(struct io_uring_cmd *cmd)
142159
return cmd_to_io_kiocb(cmd)->ctx;
143160
}
144161

162+
static inline void io_uring_cmd_done(struct io_uring_cmd *ioucmd, s32 ret,
163+
unsigned issue_flags)
164+
{
165+
return __io_uring_cmd_done(ioucmd, ret, 0, issue_flags, false);
166+
}
167+
168+
static inline void io_uring_cmd_done32(struct io_uring_cmd *ioucmd, s32 ret,
169+
u64 res2, unsigned issue_flags)
170+
{
171+
return __io_uring_cmd_done(ioucmd, ret, res2, issue_flags, true);
172+
}
173+
145174
int io_buffer_register_bvec(struct io_uring_cmd *cmd, struct request *rq,
146175
void (*release)(void *), unsigned int index,
147176
unsigned int issue_flags);

include/linux/io_uring_types.h

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,25 @@ struct io_mapped_region {
8585
unsigned flags;
8686
};
8787

88+
/*
89+
* Return value from io_buffer_list selection, to avoid stashing it in
90+
* struct io_kiocb. For legacy/classic provided buffers, keeping a reference
91+
* across execution contexts are fine. But for ring provided buffers, the
92+
* list may go away as soon as ->uring_lock is dropped. As the io_kiocb
93+
* persists, it's better to just keep the buffer local for those cases.
94+
*/
95+
struct io_br_sel {
96+
struct io_buffer_list *buf_list;
97+
/*
98+
* Some selection parts return the user address, others return an error.
99+
*/
100+
union {
101+
void __user *addr;
102+
ssize_t val;
103+
};
104+
};
105+
106+
88107
/*
89108
* Arbitrary limit, can be raised if need be
90109
*/
@@ -671,12 +690,6 @@ struct io_kiocb {
671690
/* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
672691
struct io_buffer *kbuf;
673692

674-
/*
675-
* stores buffer ID for ring provided buffers, valid IFF
676-
* REQ_F_BUFFER_RING is set.
677-
*/
678-
struct io_buffer_list *buf_list;
679-
680693
struct io_rsrc_node *buf_node;
681694
};
682695

@@ -724,10 +737,4 @@ struct io_overflow_cqe {
724737
struct list_head list;
725738
struct io_uring_cqe cqe;
726739
};
727-
728-
static inline bool io_ctx_cqe32(struct io_ring_ctx *ctx)
729-
{
730-
return ctx->flags & IORING_SETUP_CQE32;
731-
}
732-
733740
#endif

include/linux/poison.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,7 @@
9090
/********** lib/stackdepot.c **********/
9191
#define STACK_DEPOT_POISON ((void *)(0xD390 + POISON_POINTER_DELTA))
9292

93+
/********** io_uring/ **********/
94+
#define IO_URING_PTR_POISON ((void *)(0x1091UL + POISON_POINTER_DELTA))
95+
9396
#endif

include/trace/events/io_uring.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -340,8 +340,8 @@ TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe),
340340
__entry->user_data = cqe->user_data;
341341
__entry->res = cqe->res;
342342
__entry->cflags = cqe->flags;
343-
__entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0;
344-
__entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0;
343+
__entry->extra1 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[0] : 0;
344+
__entry->extra2 = ctx->flags & IORING_SETUP_CQE32 || cqe->flags & IORING_CQE_F_32 ? cqe->big_cqe[1] : 0;
345345
),
346346

347347
TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x "

0 commit comments

Comments
 (0)