
Commit eeeb721

Merge: nvme-multipath: implement "queue-depth" iopolicy
MR: https://gitlab.com/redhat/centos-stream/src/kernel/centos-stream-9/-/merge_requests/4784
JIRA: https://issues.redhat.com/browse/RHEL-45230
Upstream Status: git://git.infradead.org/nvme.git branch "nvme-6.11"

The round-robin path selector is inefficient in cases where there is a
difference in latency between paths. In the presence of one or more
high-latency paths, the round-robin selector continues to use the
high-latency path equally. This results in a bias towards the
highest-latency path and can cause a significant decrease in overall
performance as I/Os pile up on that path. This problem is acute with
NVMe-oF controllers.

The queue-depth path selector sends I/O down the path with the lowest
number of requests in its request queue. Paths with lower latency will
clear requests more quickly and have fewer requests queued compared to
higher-latency paths. The goal of this path selector is to make more use
of lower-latency paths, which brings down overall I/O latency and
increases throughput and performance.

Signed-off-by: John Meneghini <jmeneghi@redhat.com>
Approved-by: Ewan D. Milne <emilne@redhat.com>
Approved-by: Chris Leech <cleech@redhat.com>
Approved-by: CKI KWF Bot <cki-ci-bot+kwf-gitlab-com@redhat.com>
Merged-by: Lucas Zampieri <lzampier@redhat.com>
2 parents 0c94038 + 8a86570 commit eeeb721
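For context, the policy added by this merge can be selected per subsystem through the iopolicy sysfs attribute handled by nvme_subsys_iopolicy_store() in the multipath.c diff below, or as the module-wide default via the nvme-core iopolicy module parameter (nvme_core.iopolicy=queue-depth). The following minimal user-space sketch is not part of this commit; it assumes a subsystem instance named nvme-subsys0 under the usual /sys/class/nvme-subsystem layout, which may differ on a given system.

/* Hedged example: switch one NVMe subsystem to the queue-depth iopolicy
 * at runtime via sysfs, then read the value back. Adjust the attribute
 * path for the subsystem instance present on the target machine.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/class/nvme-subsystem/nvme-subsys0/iopolicy";
	const char *policy = "queue-depth";
	char buf[32] = "";
	int fd;

	fd = open(attr, O_WRONLY);
	if (fd < 0) {
		perror("open for write");
		return 1;
	}
	if (write(fd, policy, strlen(policy)) < 0)
		perror("write");
	close(fd);

	fd = open(attr, O_RDONLY);
	if (fd < 0) {
		perror("open for read");
		return 1;
	}
	if (read(fd, buf, sizeof(buf) - 1) > 0)
		printf("current iopolicy: %s", buf);
	close(fd);
	return 0;
}

The same default can also be set at boot with nvme_core.iopolicy=queue-depth on the kernel command line.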

File tree: 3 files changed, +100 / -9 lines

drivers/nvme/host/core.c

Lines changed: 1 addition & 1 deletion
@@ -110,7 +110,7 @@ struct workqueue_struct *nvme_delete_wq;
 EXPORT_SYMBOL_GPL(nvme_delete_wq);
 
 static LIST_HEAD(nvme_subsystems);
-static DEFINE_MUTEX(nvme_subsystems_lock);
+DEFINE_MUTEX(nvme_subsystems_lock);
 
 static DEFINE_IDA(nvme_instance_ida);
 static dev_t nvme_ctrl_base_chr_devt;

drivers/nvme/host/multipath.c

Lines changed: 95 additions & 8 deletions
@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
 static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
+	[NVME_IOPOLICY_QD]	= "queue-depth",
 };
 
 static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_NUMA;
 	else if (!strncmp(val, "round-robin", 11))
 		iopolicy = NVME_IOPOLICY_RR;
+	else if (!strncmp(val, "queue-depth", 11))
+		iopolicy = NVME_IOPOLICY_QD;
 	else
 		return -EINVAL;
 
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -133,6 +136,11 @@ void nvme_mpath_start_request(struct request *rq)
 	struct nvme_ns *ns = rq->q->queuedata;
 	struct gendisk *disk = ns->head->disk;
 
+	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
+		atomic_inc(&ns->ctrl->nr_active);
+		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
+	}
+
 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
 		return;
 
@@ -146,6 +154,9 @@ void nvme_mpath_end_request(struct request *rq)
 {
 	struct nvme_ns *ns = rq->q->queuedata;
 
+	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
+		atomic_dec_if_positive(&ns->ctrl->nr_active);
+
 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
@@ -296,10 +307,15 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
 }
 
-static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
-		int node, struct nvme_ns *old)
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns, *found = NULL;
+	int node = numa_node_id();
+	struct nvme_ns *old = srcu_dereference(head->current_path[node],
+					       &head->srcu);
+
+	if (unlikely(!old))
+		return __nvme_find_path(head, node);
 
 	if (list_is_singular(&head->list)) {
 		if (nvme_path_is_disabled(old))
@@ -339,28 +355,73 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 	return found;
 }
 
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+	unsigned int depth;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (nvme_path_is_disabled(ns))
+			continue;
+
+		depth = atomic_read(&ns->ctrl->nr_active);
+
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
+			if (depth < min_depth_opt) {
+				min_depth_opt = depth;
+				best_opt = ns;
+			}
+			break;
+		case NVME_ANA_NONOPTIMIZED:
+			if (depth < min_depth_nonopt) {
+				min_depth_nonopt = depth;
+				best_nonopt = ns;
+			}
+			break;
+		default:
+			break;
+		}
+
+		if (min_depth_opt == 0)
+			return best_opt;
+	}
+
+	return best_opt ? best_opt : best_nonopt;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
 		ns->ana_state == NVME_ANA_OPTIMIZED;
 }
 
-inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
 {
 	int node = numa_node_id();
 	struct nvme_ns *ns;
 
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
 	if (unlikely(!ns))
 		return __nvme_find_path(head, node);
-
-	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
-		return nvme_round_robin_path(head, node, ns);
 	if (unlikely(!nvme_path_is_optimized(ns)))
 		return __nvme_find_path(head, node);
 	return ns;
 }
 
+inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+{
+	switch (READ_ONCE(head->subsys->iopolicy)) {
+	case NVME_IOPOLICY_QD:
+		return nvme_queue_depth_path(head);
+	case NVME_IOPOLICY_RR:
+		return nvme_round_robin_path(head);
+	default:
+		return nvme_numa_path(head);
+	}
+}
+
 static bool nvme_available_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns;
@@ -807,6 +868,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
 			nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
 }
 
+static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
+		int iopolicy)
+{
+	struct nvme_ctrl *ctrl;
+	int old_iopolicy = READ_ONCE(subsys->iopolicy);
+
+	if (old_iopolicy == iopolicy)
+		return;
+
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+	/* iopolicy changes clear the mpath by design */
+	mutex_lock(&nvme_subsystems_lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+		nvme_mpath_clear_ctrl_paths(ctrl);
+	mutex_unlock(&nvme_subsystems_lock);
+
+	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
+			subsys->subnqn,
+			nvme_iopolicy_names[old_iopolicy],
+			nvme_iopolicy_names[iopolicy]);
+}
+
 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
@@ -816,7 +900,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 
 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
-			WRITE_ONCE(subsys->iopolicy, i);
+			nvme_subsys_iopolicy_update(subsys, i);
 			return count;
 		}
 	}
@@ -926,6 +1010,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	if (!ctrl->subsys || !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
 		return 0;
 
+	/* initialize this in the identify path to cover controller resets */
+	atomic_set(&ctrl->nr_active, 0);
+
 	if (!ctrl->max_namespaces ||
 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
 		dev_err(ctrl->device,
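To make the selection rule in nvme_queue_depth_path() above concrete, here is a standalone user-space sketch of the same logic. It is an illustration only, not kernel code: the pick_queue_depth() helper, the path names, and the outstanding-request counts are all made up for the example. Among usable paths it prefers the ANA-optimized path with the fewest outstanding requests (an idle optimized path wins outright), and falls back to the least-loaded non-optimized path only when no optimized path is available.

#include <limits.h>
#include <stdio.h>

enum ana_state { OPTIMIZED, NONOPTIMIZED, INACCESSIBLE };

struct path {
	const char *name;
	enum ana_state ana;
	unsigned int nr_active;	/* requests currently outstanding on this path */
};

static const struct path *pick_queue_depth(const struct path *p, int n)
{
	const struct path *best_opt = NULL, *best_nonopt = NULL;
	unsigned int min_opt = UINT_MAX, min_nonopt = UINT_MAX;

	for (int i = 0; i < n; i++) {
		if (p[i].ana == INACCESSIBLE)
			continue;
		if (p[i].ana == OPTIMIZED && p[i].nr_active < min_opt) {
			min_opt = p[i].nr_active;
			best_opt = &p[i];
		} else if (p[i].ana == NONOPTIMIZED &&
			   p[i].nr_active < min_nonopt) {
			min_nonopt = p[i].nr_active;
			best_nonopt = &p[i];
		}
		if (min_opt == 0)	/* an idle optimized path wins outright */
			return best_opt;
	}
	return best_opt ? best_opt : best_nonopt;
}

int main(void)
{
	const struct path paths[] = {
		{ "path A (fast, optimized)",     OPTIMIZED,     3 },
		{ "path B (slow, optimized)",     OPTIMIZED,    17 },
		{ "path C (non-optimized)",       NONOPTIMIZED,  1 },
	};

	/* Picks path A: the least-loaded optimized path. */
	printf("selected: %s\n", pick_queue_depth(paths, 3)->name);
	return 0;
}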

drivers/nvme/host/nvme.h

Lines changed: 4 additions & 0 deletions
@@ -49,6 +49,7 @@ extern unsigned int admin_timeout;
 extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;
+extern struct mutex nvme_subsystems_lock;
 
 /*
  * List of workarounds for devices that required behavior not specified in
@@ -190,6 +191,7 @@ enum {
 	NVME_REQ_CANCELLED		= (1 << 0),
 	NVME_REQ_USERCMD		= (1 << 1),
 	NVME_MPATH_IO_STATS		= (1 << 2),
+	NVME_MPATH_CNT_ACTIVE		= (1 << 3),
 };
 
 static inline struct nvme_request *nvme_req(struct request *req)
@@ -356,6 +358,7 @@ struct nvme_ctrl {
 	size_t ana_log_size;
 	struct timer_list anatt_timer;
 	struct work_struct ana_work;
+	atomic_t nr_active;
 #endif
 
 #ifdef CONFIG_NVME_HOST_AUTH
@@ -404,6 +407,7 @@ static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
 enum nvme_iopolicy {
 	NVME_IOPOLICY_NUMA,
 	NVME_IOPOLICY_RR,
+	NVME_IOPOLICY_QD,
 };
 
 struct nvme_subsystem {
