Skip to content

Commit ba42e9d

Browse files
committed
osc: Fix rdma component when not using ob1
When the ob1 PML is not eligible for selection (for example, when the user sets --mca pml cm), the BML and BTL frameworks are never initialized, and the rdma osc component later fails because no BTLs are available. This patch resolves the issue by having the rdma osc component initialize the BML interface itself. Making this change required two additional, related changes. First, because the BTLs use the modex, the rdma initialization must be moved before the modex point, so that data the BTLs put in the modex is handled as expected. Second, some BTLs can require loading the entire world during init (such as usnic, or TCP when there are multiple threads and multiple NICs), so the world-loading checks are extended to include OSC. Because the other Portals4 components report that they require world loading, we assume the Portals4 osc component requires it as well. Signed-off-by: Brian Barrett <bbarrett@amazon.com> (cherry picked from commit 4215325)
1 parent 66fbfa5 commit ba42e9d

File tree

5 files changed

+41
-6
lines changed

5 files changed

+41
-6
lines changed

ompi/instance/instance.c

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,10 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
535535
return ompi_instance_print_error ("mca_pml_base_select() failed", ret);
536536
}
537537

538+
if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
539+
return ompi_instance_print_error ("ompi_osc_base_find_available() failed", ret);
540+
}
541+
538542
OMPI_TIMING_IMPORT_OPAL("orte_init");
539543
OMPI_TIMING_NEXT("rte_init-commit");
540544

@@ -616,10 +620,6 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
616620
return ompi_instance_print_error ("mca_coll_base_find_available() failed", ret);
617621
}
618622

619-
if (OMPI_SUCCESS != (ret = ompi_osc_base_find_available (OPAL_ENABLE_PROGRESS_THREADS, ompi_mpi_thread_multiple))) {
620-
return ompi_instance_print_error ("ompi_osc_base_find_available() failed", ret);
621-
}
622-
623623
/* io and topo components are not selected here -- see comment
624624
above about the io and topo frameworks being loaded lazily */
625625

@@ -653,7 +653,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
653653
return ompi_instance_print_error ("ompi_attr_create_predefined_keyvals() failed", ret);
654654
}
655655

656-
if (mca_pml_base_requires_world ()) {
656+
if (mca_pml_base_requires_world() ||
657+
mca_osc_base_requires_world()) {
657658
/* need to set up comm world for this instance -- XXX -- FIXME -- probably won't always
658659
* be the case. */
659660
if (OMPI_SUCCESS != (ret = ompi_comm_init_mpi3 ())) {
@@ -702,7 +703,8 @@ static int ompi_mpi_instance_init_common (int argc, char **argv)
702703
/* some btls/mtls require we call add_procs with all procs in the job.
703704
* since the btls/mtls have no visibility here it is up to the pml to
704705
* convey this requirement */
705-
if (mca_pml_base_requires_world ()) {
706+
if (mca_pml_base_requires_world() ||
707+
mca_osc_base_requires_world()) {
706708
if (NULL == (procs = ompi_proc_world (&nprocs))) {
707709
return ompi_instance_print_error ("ompi_proc_get_allocated () failed", ret);
708710
}

ompi/mca/osc/base/osc_base_init.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
#include "ompi/communicator/communicator.h"
3131
#include "ompi/win/win.h"
3232

33+
bool ompi_osc_base_requires_world = false;
34+
3335
int
3436
ompi_osc_base_select(ompi_win_t *win,
3537
void **base,

ompi/mca/osc/osc.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,9 @@ struct ompi_datatype_t;
5353
struct ompi_op_t;
5454
struct ompi_request_t;
5555

56+
57+
extern bool ompi_osc_base_requires_world;
58+
5659
/* ******************************************************************** */
5760

5861

@@ -419,6 +422,11 @@ typedef ompi_osc_base_module_3_0_0_t ompi_osc_base_module_t;
419422

420423
/* ******************************************************************** */
421424

425+
static inline bool mca_osc_base_requires_world (void)
426+
{
427+
return ompi_osc_base_requires_world;
428+
}
429+
422430

423431
END_C_DECLS
424432

ompi/mca/osc/portals4/osc_portals4_component.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,8 @@ component_init(bool enable_progress_threads, bool enable_mpi_threads)
348348
return ret;
349349
}
350350

351+
ompi_osc_base_requires_world = true;
352+
351353
return OMPI_SUCCESS;
352354
}
353355

ompi/mca/osc/rdma/osc_rdma_component.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,27 @@ static int ompi_osc_rdma_component_init (bool enable_progress_threads,
344344
__FILE__, __LINE__, ret);
345345
}
346346

347+
ret = mca_bml_base_init(enable_progress_threads, enable_mpi_threads);
348+
if (OPAL_SUCCESS != ret) {
349+
opal_output_verbose(1, ompi_osc_base_framework.framework_output,
350+
"%s:%d: bml_base_init() failed: %d",
351+
__FILE__, __LINE__, ret);
352+
return ret;
353+
}
354+
355+
/* check if any btls do not support dynamic add_procs */
356+
mca_btl_base_selected_module_t* selected_btl;
357+
OPAL_LIST_FOREACH(selected_btl, &mca_btl_base_modules_initialized,
358+
mca_btl_base_selected_module_t) {
359+
mca_btl_base_module_t *btl = selected_btl->btl_module;
360+
361+
if (btl->btl_flags & MCA_BTL_FLAGS_SINGLE_ADD_PROCS) {
362+
ompi_osc_base_requires_world = true;
363+
break;
364+
}
365+
366+
}
367+
347368
return ret;
348369
}
349370

0 commit comments

Comments
 (0)