@@ -445,214 +445,63 @@ static int check_provider_attr(struct fi_info *provider_info, struct fi_info *pr
445445 }
446446}
447447
448- /**
449- * Calculate device distances
450- *
451- * Calculate the distances between the current thread and all devices of
452- * type OPENFABRICS or NETWORK.
453- *
454- * The shortest distances are the nearest and therefore most efficient
455- * devices to use.
456- *
457- * Return an array of all the distances. Each entry is of type
458- * pmix_device_distance_t
459- *
460- * This function is used if there is no PMIx server running.
461- *
462- * @param distances (OUT) distances array
463- * @param ndist (OUT) number of entries in the distances array
464- *
465- * @return 0 on success. Error otherwise.
466- *
467- */
468- static int compute_dev_distances (pmix_device_distance_t * * distances ,
469- size_t * ndist )
470- {
471- int ret = 0 ;
472- size_t ninfo ;
473- pmix_info_t * info ;
474- pmix_cpuset_t cpuset ;
475- pmix_topology_t * pmix_topo ;
476- pmix_device_type_t type = PMIX_DEVTYPE_OPENFABRICS |
477- PMIX_DEVTYPE_NETWORK ;
478-
479- PMIX_CPUSET_CONSTRUCT (& cpuset );
480- ret = PMIx_Get_cpuset (& cpuset , PMIX_CPUBIND_THREAD );
481- if (PMIX_SUCCESS != ret ) {
482- goto out ;
483- }
484-
485- /* load the PMIX topology */
486- PMIX_TOPOLOGY_CREATE (pmix_topo , 1 );
487- ret = PMIx_Load_topology (pmix_topo );
488- if (PMIX_SUCCESS != ret ) {
489- goto out ;
490- }
491-
492- ninfo = 1 ;
493- PMIX_INFO_CREATE (info , ninfo );
494- PMIX_INFO_LOAD (& info [0 ], PMIX_DEVICE_TYPE , & type , PMIX_DEVTYPE );
495- ret = PMIx_Compute_distances (pmix_topo , & cpuset , info , ninfo , distances ,
496- ndist );
497- PMIX_INFO_FREE (info , ninfo );
498-
499- PMIX_TOPOLOGY_FREE (pmix_topo , 1 );
500- out :
501- return ret ;
502- }
503-
504- /**
505- * Find the nearest devices to the current thread
506- *
507- * Use the PMIx server or calculate the device distances, then out of the set of
508- * returned distances find the subset of the nearest devices. This can be
509- * 1 or more.
510- *
511- * @param num_distances (OUT) number of entries in the returned array
512- *
513- * @return An array of device distances which are nearest this thread
514- * or NULL if we fail to get the distances. In this case we will just
515- * revert to round robin.
516- *
517- */
518- static pmix_device_distance_t * get_nearest_nics (int * num_distances )
519- {
520- size_t ndist ;
521- pmix_topology_t * topo ;
522- int ret , i , idx = 0 ;
523- pmix_data_array_t * dptr ;
524- uint16_t near = USHRT_MAX ;
525- pmix_info_t directive ;
526- pmix_value_t * val = NULL ;
527- pmix_device_distance_t * distances , * nearest = NULL ;
528-
529- PMIX_INFO_LOAD (& directive , PMIX_OPTIONAL , NULL , PMIX_BOOL );
530- ret = PMIx_Get (& opal_process_info .myprocid ,
531- PMIX_DEVICE_DISTANCES , & directive , 1 , & val );
532- PMIX_INFO_DESTRUCT (& directive );
533- if (ret != PMIX_SUCCESS || !val ) {
534- ret = compute_dev_distances (& distances , & ndist );
535- if (ret )
536- goto out ;
537- goto find_nearest ;
538- }
539-
540- if (PMIX_DATA_ARRAY != val -> type ) {
541- goto out ;
542- }
543- dptr = val -> data .darray ;
544- if (NULL == dptr ) {
545- goto out ;
546- }
547- if (PMIX_DEVICE_DIST != dptr -> type ) {
548- goto out ;
549- }
550-
551- distances = (pmix_device_distance_t * )dptr -> array ;
552- ndist = dptr -> size ;
553-
554- find_nearest :
555- nearest = calloc (sizeof (* distances ), ndist );
556- if (!nearest )
557- goto out ;
558-
559- for (i = 0 ; i < ndist ; i ++ ) {
560- if (distances [i ].mindist < near ) {
561- idx = 0 ;
562- near = distances [i ].mindist ;
563- nearest [idx ] = distances [i ];
564- idx ++ ;
565- } else if (distances [i ].mindist == near ) {
566- nearest [idx ] = distances [i ];
567- idx ++ ;
568- }
569- }
570-
571- * num_distances = idx ;
572-
573- out :
574- if (val )
575- PMIX_VALUE_RELEASE (val );
576- return nearest ;
577- }
578-
579448#if OPAL_OFI_PCI_DATA_AVAILABLE
580- /**
581- * Determine if a device is nearest
582- *
583- * Given a device distances array of the nearest pci devices,
584- * determine if one of these device distances refers to the pci
585- * device passed in
449+ /* Check if a process and a pci device share the same cpuset
450+ * @param (IN) pci struct fi_pci_attr pci device attributes,
451+ * used to find hwloc object for device.
586452 *
587- * @param distances (IN) distances array
588- * @param num_distances (IN) number of entries in the distances array
589- * @param topology (IN) topology of the node
590- * @param pci (IN) PCI device being examined
453+ * @param (IN) topology hwloc_topology_t topology to get the cpusets
454+ * from
591455 *
592- * @return true if the PCI device is in the distances array or if the
593- * distances array is not provided. False otherwise.
456+ * @return true if the cpusets intersect and false if they
457+ * do not intersect or an error prevents the comparison
594458 *
459+ * Uses a PCI device to find an ancestor that contains a cpuset, and
460+ * determines if it intersects with the cpuset that the process is bound to.
461+ * If the binding of the process cannot be retrieved, or if a cpuset is
462+ * unavailable for whatever reason, returns false. Otherwise, returns the
463+ * result of hwloc_bitmap_intersects().
595464 */
596- static bool is_near (pmix_device_distance_t * distances ,
597- int num_distances ,
598- hwloc_topology_t topology ,
599- struct fi_pci_attr pci )
465+ static bool compare_cpusets (hwloc_topology_t topology , struct fi_pci_attr pci )
600466{
601- hwloc_obj_t pcidev , osdev ;
602-
603- /* if we failed to find any distances, then we consider all interfaces
604- * to be of equal distances and let the caller decide how to handle
605- * them
606- */
607- if (!distances )
608- return true;
467+ bool result = false;
468+ int ret ;
469+ hwloc_bitmap_t proc_cpuset ;
470+ hwloc_obj_t obj = NULL ;
609471
610- pcidev = hwloc_get_pcidev_by_busid (topology , pci .domain_id ,
611- pci .bus_id , pci .device_id ,
612- pci .function_id );
613- if (!pcidev )
472+ /* Cannot find topology info if no topology is found */
473+ if (NULL == topology ) {
614474 return false;
475+ }
615476
616- for (osdev = pcidev -> io_first_child ; osdev != NULL ; osdev = osdev -> next_sibling ) {
617- int i ;
618-
619- if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_OPENFABRICS ) {
620- const char * nguid = hwloc_obj_get_info_by_name (osdev ,"NodeGUID" );
621- const char * sguid = hwloc_obj_get_info_by_name (osdev , "SysImageGUID" );
477+ /* Allocate memory for proc_cpuset */
478+ proc_cpuset = hwloc_bitmap_alloc ();
479+ if (NULL == proc_cpuset ) {
480+ return false;
481+ }
622482
623- if (!nguid && !sguid )
624- continue ;
483+ /* Fill cpuset with the collection of cpu cores that the process runs on */
484+ ret = hwloc_get_cpubind (topology , proc_cpuset , HWLOC_CPUBIND_PROCESS );
485+ if (0 > ret ) {
486+ goto error ;
487+ }
625488
626- for (i = 0 ; i < num_distances ; i ++ ) {
627- char lsguid [256 ], lnguid [256 ];
628- int ret ;
489+ /* Get the pci device from bdf */
490+ obj = hwloc_get_pcidev_by_busid (topology , pci .domain_id , pci .bus_id , pci .device_id ,
491+ pci .function_id );
492+ if (NULL == obj ) {
493+ goto error ;
494+ }
629495
630- ret = sscanf (distances [i ].uuid , "fab://%256s::%256s" , lnguid , lsguid );
631- if (ret != 2 )
632- continue ;
633- if (0 == strcasecmp (lnguid , nguid )) {
634- return true;
635- } else if (0 == strcasecmp (lsguid , sguid )) {
636- return true;
637- }
638- }
639- } else if (osdev -> attr -> osdev .type == HWLOC_OBJ_OSDEV_NETWORK ) {
640- const char * address = hwloc_obj_get_info_by_name (osdev , "Address" );
641- if (!address )
642- continue ;
643- for (i = 0 ; i < num_distances ; i ++ ) {
644- char * addr = strstr (distances [i ].uuid , "://" );
645- if (!addr || addr + 3 > distances [i ].uuid
646- + strlen (distances [i ].uuid ))
647- continue ;
648- if (!strcmp (addr + 3 , address )) {
649- return true;
650- }
651- }
652- }
496+ /* pcidev objects don't have cpusets so find the first non-io object above */
497+ obj = hwloc_get_non_io_ancestor_obj (topology , obj );
498+ if (NULL != obj ) {
499+ result = hwloc_bitmap_intersects (proc_cpuset , obj -> cpuset );
653500 }
654501
655- return false;
502+ error :
503+ hwloc_bitmap_free (proc_cpuset );
504+ return result ;
656505}
657506#endif
658507
@@ -765,10 +614,7 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
765614 struct fi_info * provider = provider_list , * current_provider = provider_list ;
766615 struct fi_info * * provider_table ;
767616#if OPAL_OFI_PCI_DATA_AVAILABLE
768- pmix_device_distance_t * distances = NULL ;
769617 struct fi_pci_attr pci ;
770- int num_distances = 0 ;
771- bool near ;
772618#endif
773619 int ret ;
774620 unsigned int num_provider = 0 , provider_limit = 0 ;
@@ -793,38 +639,33 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
793639 return provider_list ;
794640 }
795641
796- #if OPAL_OFI_PCI_DATA_AVAILABLE
797- /* find all the nearest devices to this thread, then out of these
798- * determine which device we should bind to.
799- */
800- distances = get_nearest_nics (& num_distances );
801- #endif
802-
803642 current_provider = provider ;
804643
805644 /* Cycle through remaining fi_info objects, looking for alike providers */
806645 while (NULL != current_provider ) {
807646 if (!check_provider_attr (provider , current_provider )) {
808- near = false;
647+ cpusets_match = false;
809648#if OPAL_OFI_PCI_DATA_AVAILABLE
810649 if (NULL != current_provider -> nic
811650 && NULL != current_provider -> nic -> bus_attr
812651 && current_provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
813652 pci = current_provider -> nic -> bus_attr -> attr .pci ;
814- near = is_near (distances , num_distances ,
815- opal_hwloc_topology , pci );
653+ cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
816654 }
817655#endif
818- /* We could have multiple near providers */
819- if (near && !provider_found ) {
656+
657+ /* Reset the list if the cpusets match and no other provider was
658+ * found on the same cpuset as the process.
659+ */
660+ if (cpusets_match && !provider_found ) {
820661 provider_found = true;
821662 num_provider = 0 ;
822663 }
823664
824665 /* Add the provider to the provider list if the cpusets match or if
825666 * no other provider was found on the same cpuset as the process.
826667 */
827- if (near || !provider_found ) {
668+ if (cpusets_match || !provider_found ) {
828669 provider_table [num_provider ] = current_provider ;
829670 num_provider ++ ;
830671 }
@@ -846,22 +687,17 @@ struct fi_info *opal_common_ofi_select_provider(struct fi_info *provider_list,
846687 && NULL != provider -> nic -> bus_attr
847688 && provider -> nic -> bus_attr -> bus_type == FI_BUS_PCI ) {
848689 pci = provider -> nic -> bus_attr -> attr .pci ;
849- near = is_near (distances , num_distances ,
850- opal_hwloc_topology , pci );
690+ cpusets_match = compare_cpusets (opal_hwloc_topology , pci );
851691 }
852692#endif
853693
854694#if OPAL_ENABLE_DEBUG
855695 opal_output_verbose (1 , opal_common_ofi .output ,
856- "package rank: %d device: %s near : %s\n" , package_rank ,
857- provider -> domain_attr -> name , near ? "true" : "false" );
696+ "package rank: %d device: %s cpusets match : %s\n" , package_rank ,
697+ provider -> domain_attr -> name , cpusets_match ? "true" : "false" );
858698#endif
859699
860700 free (provider_table );
861- #if OPAL_OFI_PCI_DATA_AVAILABLE
862- if (distances )
863- free (distances );
864- #endif
865701 return provider ;
866702}
867703
0 commit comments