55 * Copyright (c) 2004-2017 The University of Tennessee and The University
66 * of Tennessee Research Foundation. All rights
77 * reserved.
8- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8+ * Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
99 * University of Stuttgart. All rights reserved.
1010 * Copyright (c) 2004-2005 The Regents of the University of California.
1111 * All rights reserved.
5454#include "opal/mca/common/cuda/common_cuda.h"
5555#endif /* OPAL_CUDA_SUPPORT */
5656#include "opal/util/info_subscriber.h"
57+ #include "opal/mca/mpool/base/base.h"
5758
5859#include "ompi/info/info.h"
5960#include "ompi/communicator/communicator.h"
@@ -305,6 +306,16 @@ static int ompi_osc_rdma_component_register (void)
305306 MCA_BASE_VAR_TYPE_UNSIGNED_LONG , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
306307 MCA_BASE_VAR_SCOPE_LOCAL , & mca_osc_rdma_component .network_amo_max_count );
307308
309+ mca_osc_rdma_component .memory_alignment = opal_getpagesize ();
310+ opal_asprintf (& description_str , "The minimum memory alignment used to allocate local window memory (default: %zu). "
311+ "This is a best effort approach. Alignments larger than the page size may not be supported." ,
312+ mca_osc_rdma_component .memory_alignment );
313+ (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "minimum_memory_alignment" ,
314+ description_str ,
315+ MCA_BASE_VAR_TYPE_SIZE_T , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
316+ MCA_BASE_VAR_SCOPE_READONLY , & mca_osc_rdma_component .memory_alignment );
317+ free (description_str );
318+
308319 /* register performance variables */
309320
310321 (void ) mca_base_component_pvar_register (& mca_osc_rdma_component .super .osc_version , "put_retry_count" ,
@@ -390,7 +401,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
390401{
391402
392403 if (MPI_WIN_FLAVOR_SHARED == flavor ) {
393- return -1 ;
404+ return OMPI_ERR_RMA_SHARED ;
394405 }
395406
396407#if OPAL_CUDA_SUPPORT
@@ -448,9 +459,10 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
448459
449460static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size )
450461{
451- size_t total_size , local_rank_array_size , leader_peer_data_size ;
462+ size_t total_size , local_rank_array_size , leader_peer_data_size , base_data_size ;
452463 ompi_osc_rdma_peer_t * my_peer ;
453464 int ret , my_rank ;
465+ size_t memory_alignment = module -> memory_alignment ;
454466
455467 OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "allocating private internal state" );
456468
@@ -463,32 +475,34 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
463475 * registration handles needed to access this data. */
464476 total_size = local_rank_array_size + module -> region_size +
465477 module -> state_size + leader_peer_data_size ;
466- total_size += OPAL_ALIGN_PAD_AMOUNT ( total_size , OPAL_ALIGN_MIN ) ;
478+ base_data_size = total_size ;
467479
468480 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
469- total_size += size ;
481+ base_data_size += OPAL_ALIGN_PAD_AMOUNT (base_data_size , memory_alignment );
482+ total_size = base_data_size + size ;
470483 }
471484
472485 /* the local data is ordered as follows: rank array (leader, offset mapping), state, leader peer data, and base
473486 * (if using MPI_Win_allocate). In this case the leader peer data array does not need to be stored in the same
474487 * segment but placing it there simplifies the peer data fetch and cleanup code. */
475488
476- module -> rank_array = calloc (total_size , 1 );
489+ module -> rank_array = mca_mpool_base_default_module -> mpool_alloc (mca_mpool_base_default_module , total_size ,
490+ memory_alignment , 0 );
477491 if (OPAL_UNLIKELY (NULL == module -> rank_array )) {
478492 return OMPI_ERR_OUT_OF_RESOURCE ;
479493 }
480494
481- // Note, the extra module->region_size space added after local_rank_array_size
482- // is unused but is there to match what happens in allocte_state_shared()
483- // This allows module->state_offset to be uniform across the ranks which
484- // is part of how they pull peer info from each other.
495+ /* Note, the extra module->region_size space added after local_rank_array_size
496+ * is unused but is there to match what happens in allocate_state_shared()
497+ * This allows module->state_offset to be uniform across the ranks which
498+ * is part of how they pull peer info from each other. */
485499 module -> state_offset = local_rank_array_size + module -> region_size ;
486500
487501 module -> state = (ompi_osc_rdma_state_t * ) ((intptr_t ) module -> rank_array + module -> state_offset );
488502 module -> node_comm_info = (unsigned char * ) ((intptr_t ) module -> state + module -> state_size );
489503
490504 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
491- * base = (void * ) ((intptr_t ) module -> node_comm_info + leader_peer_data_size );
505+ * base = (void * ) ((intptr_t ) module -> rank_array + base_data_size );
492506 }
493507
494508 /* just go ahead and register the whole segment */
@@ -583,7 +597,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
583597 ompi_osc_rdma_region_t * state_region ;
584598 struct _local_data * temp ;
585599 char * data_file ;
586- int page_size = opal_getpagesize () ;
600+ size_t memory_alignment = module -> memory_alignment ;
587601
588602 shared_comm = module -> shared_comm ;
589603
@@ -620,8 +634,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
620634
621635 /* ensure proper alignment */
622636 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
623- data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , page_size );
624- size += OPAL_ALIGN_PAD_AMOUNT (size , page_size );
637+ data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , memory_alignment );
638+ size += OPAL_ALIGN_PAD_AMOUNT (size , memory_alignment );
625639 }
626640
627641 do {
@@ -649,6 +663,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
649663 my_base_offset = total_size ;
650664 }
651665 total_size += temp [i ].size ;
666+ total_size += OPAL_ALIGN_PAD_AMOUNT (total_size , memory_alignment );
652667 }
653668 }
654669
@@ -660,12 +675,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
660675 if (0 > ret ) {
661676 ret = OMPI_ERR_OUT_OF_RESOURCE ;
662677 } else {
663- /* allocate enough space for the state + data for all local ranks */
664- ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
665- free (data_file );
666- if (OPAL_SUCCESS != ret ) {
667- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
668- }
678+ /* allocate enough space for the state + data for all local ranks */
679+ ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
680+ free (data_file );
681+ if (OPAL_SUCCESS != ret ) {
682+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
683+ }
669684 }
670685 }
671686
@@ -692,6 +707,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
692707 }
693708
694709 if (size && MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
710+ size_t page_size = opal_getpagesize ();
695711 char * baseptr = (char * )((intptr_t ) module -> segment_base + my_base_offset );
696712 * base = (void * )baseptr ;
697713 // touch each page to force allocation on local NUMA node
@@ -795,7 +811,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
795811 }
796812
797813 if (my_rank == peer_rank ) {
798- module -> my_peer = peer ;
814+ module -> my_peer = peer ;
799815 }
800816
801817 if (MPI_WIN_FLAVOR_DYNAMIC != module -> flavor && MPI_WIN_FLAVOR_CREATE != module -> flavor &&
@@ -855,12 +871,12 @@ static int ompi_osc_rdma_query_mtls (void)
855871
856872 mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names , ',' );
857873 if (mtls_to_use && ompi_mtl_base_selected_component ) {
858- for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
859- if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
860- opal_argv_free (mtls_to_use );
861- return OMPI_SUCCESS ;
862- }
863- }
874+ for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
875+ if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
876+ opal_argv_free (mtls_to_use );
877+ return OMPI_SUCCESS ;
878+ }
879+ }
864880 }
865881 opal_argv_free (mtls_to_use );
866882 return -1 ;
@@ -1305,6 +1321,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13051321 int world_size = ompi_comm_size (comm );
13061322 int init_limit = 256 ;
13071323 int ret ;
1324+ int flag ;
1325+ char infoval [32 ];
13081326 char * name ;
13091327
13101328 /* the osc/sm component is the exclusive provider for support for shared
@@ -1343,6 +1361,18 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13431361 module -> win = win ;
13441362 module -> disp_unit = disp_unit ;
13451363 module -> size = size ;
1364+ module -> memory_alignment = mca_osc_rdma_component .memory_alignment ;
1365+ if (NULL != info ) {
1366+ opal_cstring_t * align_info_str ;
1367+ opal_info_get (info , "mpi_minimum_memory_alignment" , & align_info_str , & flag );
1368+ if (flag ) {
1369+ ssize_t tmp_align = atoll (align_info_str -> string );
1370+ OBJ_RELEASE (align_info_str );
1371+ if (OPAL_ALIGN_MIN < tmp_align ) {
1372+ module -> memory_alignment = tmp_align ;
1373+ }
1374+ }
1375+ }
13461376
13471377 /* set the module so we properly cleanup */
13481378 win -> w_osc_module = (ompi_osc_base_module_t * ) module ;
@@ -1415,15 +1445,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14151445 } else {
14161446 module -> state_size += mca_osc_rdma_component .max_attach * module -> region_size ;
14171447 }
1418- /*
1419- * These are the info's that this module is interested in
1420- */
1448+
1449+ /*
1450+ * These are the info keys that this module is interested in
1451+ */
14211452 opal_infosubscribe_subscribe (& win -> super , "no_locks" , "false" , ompi_osc_rdma_set_no_lock_info );
14221453
1423- /*
1424- * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1425- * to be used anywhere. If that changes, they should be subscribed
1426- */
1454+ /*
1455+ * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1456+ * to be used anywhere. If that changes, they should be subscribed
1457+ */
14271458
14281459 /* fill in the function pointer part */
14291460 memcpy (& module -> super , & ompi_osc_rdma_module_rdma_template , sizeof (module -> super ));
@@ -1541,8 +1572,8 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
15411572 }
15421573 /* enforce collectiveness... */
15431574 module -> comm -> c_coll -> coll_barrier (module -> comm , module -> comm -> c_coll -> coll_barrier_module );
1544- /*
1545- * Accept any value
1546- */
1575+ /*
1576+ * Accept any value
1577+ */
15471578 return module -> no_locks ? "true" : "false" ;
15481579}
0 commit comments