@@ -17,6 +17,7 @@ MODULE_PARM_DESC(multipath,
 static const char *nvme_iopolicy_names[] = {
 	[NVME_IOPOLICY_NUMA]	= "numa",
 	[NVME_IOPOLICY_RR]	= "round-robin",
+	[NVME_IOPOLICY_QD]	= "queue-depth",
 };
 
 static int iopolicy = NVME_IOPOLICY_NUMA;
@@ -29,6 +30,8 @@ static int nvme_set_iopolicy(const char *val, const struct kernel_param *kp)
 		iopolicy = NVME_IOPOLICY_NUMA;
 	else if (!strncmp(val, "round-robin", 11))
 		iopolicy = NVME_IOPOLICY_RR;
+	else if (!strncmp(val, "queue-depth", 11))
+		iopolicy = NVME_IOPOLICY_QD;
 	else
 		return -EINVAL;
 
@@ -43,7 +46,7 @@ static int nvme_get_iopolicy(char *buf, const struct kernel_param *kp)
 module_param_call(iopolicy, nvme_set_iopolicy, nvme_get_iopolicy,
 	&iopolicy, 0644);
 MODULE_PARM_DESC(iopolicy,
-	"Default multipath I/O policy; 'numa' (default) or 'round-robin'");
+	"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
 
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
 {
@@ -133,6 +136,11 @@ void nvme_mpath_start_request(struct request *rq)
 	struct nvme_ns *ns = rq->q->queuedata;
 	struct gendisk *disk = ns->head->disk;
 
+	if (READ_ONCE(ns->head->subsys->iopolicy) == NVME_IOPOLICY_QD) {
+		atomic_inc(&ns->ctrl->nr_active);
+		nvme_req(rq)->flags |= NVME_MPATH_CNT_ACTIVE;
+	}
+
 	if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
 		return;
 
@@ -146,6 +154,9 @@ void nvme_mpath_end_request(struct request *rq)
 {
 	struct nvme_ns *ns = rq->q->queuedata;
 
+	if (nvme_req(rq)->flags & NVME_MPATH_CNT_ACTIVE)
+		atomic_dec_if_positive(&ns->ctrl->nr_active);
+
 	if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
 		return;
 	bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
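
The two hunks above form the accounting half of the new policy: when the subsystem policy is queue-depth, nvme_mpath_start_request() increments the per-controller nr_active counter and tags the request with NVME_MPATH_CNT_ACTIVE, and nvme_mpath_end_request() drops the counter only for tagged requests. atomic_dec_if_positive() keeps the counter from going negative when a controller reset has already re-zeroed nr_active underneath in-flight I/O. A minimal userspace model of the same pattern, using C11 atomics in place of the kernel's atomic_t (all names here are illustrative, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define CNT_ACTIVE 0x1		/* models NVME_MPATH_CNT_ACTIVE */

static atomic_uint nr_active;	/* models ctrl->nr_active */

struct request {
	unsigned int flags;
};

static void start_request(struct request *rq, bool qd_policy)
{
	if (qd_policy) {
		atomic_fetch_add(&nr_active, 1);
		rq->flags |= CNT_ACTIVE;	/* only tagged requests decrement */
	}
}

static void end_request(struct request *rq)
{
	unsigned int cur;

	if (!(rq->flags & CNT_ACTIVE))
		return;
	/* models atomic_dec_if_positive(): never go below zero, which
	 * matters after a reset has re-zeroed the counter mid-flight */
	cur = atomic_load(&nr_active);
	while (cur > 0 &&
	       !atomic_compare_exchange_weak(&nr_active, &cur, cur - 1))
		;
}

int main(void)
{
	struct request rq = { 0 };

	start_request(&rq, true);
	end_request(&rq);
	end_request(&rq);	/* extra completion is harmless: the floor holds at zero */
	printf("nr_active = %u\n", atomic_load(&nr_active));
	return 0;
}
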
@@ -296,10 +307,15 @@ static struct nvme_ns *nvme_next_ns(struct nvme_ns_head *head,
 	return list_first_or_null_rcu(&head->list, struct nvme_ns, siblings);
 }
 
-static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
-		int node, struct nvme_ns *old)
+static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns, *found = NULL;
+	int node = numa_node_id();
+	struct nvme_ns *old = srcu_dereference(head->current_path[node],
+					       &head->srcu);
+
+	if (unlikely(!old))
+		return __nvme_find_path(head, node);
 
 	if (list_is_singular(&head->list)) {
 		if (nvme_path_is_disabled(old))
@@ -339,28 +355,73 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
 	return found;
 }
 
+static struct nvme_ns *nvme_queue_depth_path(struct nvme_ns_head *head)
+{
+	struct nvme_ns *best_opt = NULL, *best_nonopt = NULL, *ns;
+	unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+	unsigned int depth;
+
+	list_for_each_entry_rcu(ns, &head->list, siblings) {
+		if (nvme_path_is_disabled(ns))
+			continue;
+
+		depth = atomic_read(&ns->ctrl->nr_active);
+
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
+			if (depth < min_depth_opt) {
+				min_depth_opt = depth;
+				best_opt = ns;
+			}
+			break;
+		case NVME_ANA_NONOPTIMIZED:
+			if (depth < min_depth_nonopt) {
+				min_depth_nonopt = depth;
+				best_nonopt = ns;
+			}
+			break;
+		default:
+			break;
+		}
+
+		if (min_depth_opt == 0)
+			return best_opt;
+	}
+
+	return best_opt ? best_opt : best_nonopt;
+}
+
 static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
 {
 	return nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE &&
 		ns->ana_state == NVME_ANA_OPTIMIZED;
 }
 
-inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+static struct nvme_ns *nvme_numa_path(struct nvme_ns_head *head)
 {
 	int node = numa_node_id();
 	struct nvme_ns *ns;
 
 	ns = srcu_dereference(head->current_path[node], &head->srcu);
 	if (unlikely(!ns))
 		return __nvme_find_path(head, node);
-
-	if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
-		return nvme_round_robin_path(head, node, ns);
 	if (unlikely(!nvme_path_is_optimized(ns)))
 		return __nvme_find_path(head, node);
 	return ns;
 }
 
+inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
+{
+	switch (READ_ONCE(head->subsys->iopolicy)) {
+	case NVME_IOPOLICY_QD:
+		return nvme_queue_depth_path(head);
+	case NVME_IOPOLICY_RR:
+		return nvme_round_robin_path(head);
+	default:
+		return nvme_numa_path(head);
+	}
+}
+
 static bool nvme_available_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns;
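
The hunk above adds the selection half: nvme_queue_depth_path() scans the sibling paths, tracking the least-loaded ANA-optimized and non-optimized candidates separately, bails out early as soon as it sees an idle optimized path, and falls back to a non-optimized path only when no optimized one exists. A standalone sketch of that two-tier minimum search (toy data, invented names, not kernel code):

#include <limits.h>
#include <stdbool.h>
#include <stdio.h>

struct path {
	const char *name;
	unsigned int depth;	/* in-flight requests, i.e. nr_active */
	bool optimized;		/* ANA optimized vs. non-optimized */
};

static const struct path *pick(const struct path *p, int n)
{
	const struct path *best_opt = NULL, *best_nonopt = NULL;
	unsigned int min_opt = UINT_MAX, min_nonopt = UINT_MAX;

	for (int i = 0; i < n; i++) {
		if (p[i].optimized) {
			if (p[i].depth < min_opt) {
				min_opt = p[i].depth;
				best_opt = &p[i];
			}
		} else if (p[i].depth < min_nonopt) {
			min_nonopt = p[i].depth;
			best_nonopt = &p[i];
		}
		if (min_opt == 0)	/* an idle optimized path wins outright */
			break;
	}
	/* non-optimized paths are used only when no optimized path exists */
	return best_opt ? best_opt : best_nonopt;
}

int main(void)
{
	const struct path paths[] = {
		{ "path0", 4, true },
		{ "path1", 1, true },
		{ "path2", 0, false },
	};

	printf("selected: %s\n", pick(paths, 3)->name);	/* prints "path1" */
	return 0;
}
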
@@ -807,6 +868,29 @@ static ssize_t nvme_subsys_iopolicy_show(struct device *dev,
 			  nvme_iopolicy_names[READ_ONCE(subsys->iopolicy)]);
 }
 
+static void nvme_subsys_iopolicy_update(struct nvme_subsystem *subsys,
+		int iopolicy)
+{
+	struct nvme_ctrl *ctrl;
+	int old_iopolicy = READ_ONCE(subsys->iopolicy);
+
+	if (old_iopolicy == iopolicy)
+		return;
+
+	WRITE_ONCE(subsys->iopolicy, iopolicy);
+
+	/* iopolicy changes clear the mpath by design */
+	mutex_lock(&nvme_subsystems_lock);
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+		nvme_mpath_clear_ctrl_paths(ctrl);
+	mutex_unlock(&nvme_subsystems_lock);
+
+	pr_notice("subsysnqn %s iopolicy changed from %s to %s\n",
+			subsys->subnqn,
+			nvme_iopolicy_names[old_iopolicy],
+			nvme_iopolicy_names[iopolicy]);
+}
+
 static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
@@ -816,7 +900,7 @@ static ssize_t nvme_subsys_iopolicy_store(struct device *dev,
 
 	for (i = 0; i < ARRAY_SIZE(nvme_iopolicy_names); i++) {
 		if (sysfs_streq(buf, nvme_iopolicy_names[i])) {
-			WRITE_ONCE(subsys->iopolicy, i);
+			nvme_subsys_iopolicy_update(subsys, i);
 			return count;
 		}
 	}
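
With the store hook routed through nvme_subsys_iopolicy_update(), the policy can be switched at runtime per subsystem; the nvme_core.iopolicy module parameter sets the load-time default. A minimal userspace sketch that selects the new policy via sysfs — the nvme-subsys0 name is an assumption, substitute the subsystem on the target machine, and the written string must match an entry in nvme_iopolicy_names[]:

#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/class/nvme-subsystem/nvme-subsys0/iopolicy";
	FILE *f = fopen(attr, "w");

	if (!f) {
		perror(attr);
		return 1;
	}
	/* sysfs_streq() in the store hook tolerates the trailing newline */
	fprintf(f, "queue-depth\n");
	return fclose(f) ? 1 : 0;
}
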
@@ -926,6 +1010,9 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	if (!ctrl->subsys || !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
 		return 0;
 
+	/* initialize this in the identify path to cover controller resets */
+	atomic_set(&ctrl->nr_active, 0);
+
 	if (!ctrl->max_namespaces ||
 	    ctrl->max_namespaces > le32_to_cpu(id->nn)) {
 		dev_err(ctrl->device,