55 * Copyright (c) 2004-2017 The University of Tennessee and The University
66 * of Tennessee Research Foundation. All rights
77 * reserved.
8- * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
8+ * Copyright (c) 2004-2020 High Performance Computing Center Stuttgart,
99 * University of Stuttgart. All rights reserved.
1010 * Copyright (c) 2004-2005 The Regents of the University of California.
1111 * All rights reserved.
5454#include "opal/mca/common/cuda/common_cuda.h"
5555#endif /* OPAL_CUDA_SUPPORT */
5656#include "opal/util/info_subscriber.h"
57+ #include "opal/mca/mpool/base/base.h"
5758
5859#include "ompi/info/info.h"
5960#include "ompi/communicator/communicator.h"
@@ -305,6 +306,16 @@ static int ompi_osc_rdma_component_register (void)
305306 MCA_BASE_VAR_TYPE_UNSIGNED_LONG , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
306307 MCA_BASE_VAR_SCOPE_LOCAL , & mca_osc_rdma_component .network_amo_max_count );
307308
309+ mca_osc_rdma_component .memory_alignment = opal_getpagesize ();
310+ opal_asprintf (& description_str , "The minimum memory alignment used to allocate local window memory (default: %zu). "
311+ "This is a best effort approach. Alignments larger than the page size may not be supported." ,
312+ mca_osc_rdma_component .memory_alignment );
313+ (void ) mca_base_component_var_register (& mca_osc_rdma_component .super .osc_version , "minimum_memory_alignment" ,
314+ description_str ,
315+ MCA_BASE_VAR_TYPE_SIZE_T , NULL , 0 , 0 , OPAL_INFO_LVL_3 ,
316+ MCA_BASE_VAR_SCOPE_READONLY , & mca_osc_rdma_component .memory_alignment );
317+ free (description_str );
318+
308319 /* register performance variables */
309320
310321 (void ) mca_base_component_pvar_register (& mca_osc_rdma_component .super .osc_version , "put_retry_count" ,
@@ -390,7 +401,7 @@ static int ompi_osc_rdma_component_query (struct ompi_win_t *win, void **base, s
390401{
391402
392403 if (MPI_WIN_FLAVOR_SHARED == flavor ) {
393- return -1 ;
404+ return OMPI_ERR_RMA_SHARED ;
394405 }
395406
396407#if OPAL_CUDA_SUPPORT
@@ -448,9 +459,10 @@ static int ompi_osc_rdma_initialize_region (ompi_osc_rdma_module_t *module, void
448459
449460static int allocate_state_single (ompi_osc_rdma_module_t * module , void * * base , size_t size )
450461{
451- size_t total_size , local_rank_array_size , leader_peer_data_size ;
462+ size_t total_size , local_rank_array_size , leader_peer_data_size , base_data_size ;
452463 ompi_osc_rdma_peer_t * my_peer ;
453464 int ret , my_rank ;
465+ size_t memory_alignment = module -> memory_alignment ;
454466
455467 OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_TRACE , "allocating private internal state" );
456468
@@ -463,32 +475,34 @@ static int allocate_state_single (ompi_osc_rdma_module_t *module, void **base, s
463475 * registration handles needed to access this data. */
464476 total_size = local_rank_array_size + module -> region_size +
465477 module -> state_size + leader_peer_data_size ;
466- total_size += OPAL_ALIGN_PAD_AMOUNT ( total_size , OPAL_ALIGN_MIN ) ;
478+ base_data_size = total_size ;
467479
468480 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
469- total_size += size ;
481+ base_data_size += OPAL_ALIGN_PAD_AMOUNT (base_data_size , memory_alignment );
482+ total_size = base_data_size + size ;
470483 }
471484
472485 /* the local data is ordered as follows: rank array (leader, offset mapping), state, leader peer data, and base
473486 * (if using MPI_Win_allocate). In this case the leader peer data array does not need to be stored in the same
474487 * segment but placing it there simplifies the peer data fetch and cleanup code. */
475488
476- module -> rank_array = calloc (total_size , 1 );
489+ module -> rank_array = mca_mpool_base_default_module -> mpool_alloc (mca_mpool_base_default_module , total_size ,
490+ memory_alignment , 0 );
477491 if (OPAL_UNLIKELY (NULL == module -> rank_array )) {
478492 return OMPI_ERR_OUT_OF_RESOURCE ;
479493 }
480494
481- // Note, the extra module->region_size space added after local_rank_array_size
482- // is unused but is there to match what happens in allocte_state_shared()
483- // This allows module->state_offset to be uniform across the ranks which
484- // is part of how they pull peer info from each other.
495+ /* Note, the extra module->region_size space added after local_rank_array_size
496+ * is unused but is there to match what happens in allocate_state_shared()
497+ * This allows module->state_offset to be uniform across the ranks which
498+ * is part of how they pull peer info from each other. */
485499 module -> state_offset = local_rank_array_size + module -> region_size ;
486500
487501 module -> state = (ompi_osc_rdma_state_t * ) ((intptr_t ) module -> rank_array + module -> state_offset );
488502 module -> node_comm_info = (unsigned char * ) ((intptr_t ) module -> state + module -> state_size );
489503
490504 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
491- * base = (void * ) ((intptr_t ) module -> node_comm_info + leader_peer_data_size );
505+ * base = (void * ) ((intptr_t ) module -> rank_array + base_data_size );
492506 }
493507
494508 /* just go ahead and register the whole segment */
@@ -583,7 +597,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
583597 ompi_osc_rdma_region_t * state_region ;
584598 struct _local_data * temp ;
585599 char * data_file ;
586- int page_size = opal_getpagesize () ;
600+ size_t memory_alignment = module -> memory_alignment ;
587601
588602 shared_comm = module -> shared_comm ;
589603
@@ -616,8 +630,8 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
616630
617631 /* ensure proper alignment */
618632 if (MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
619- data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , page_size );
620- size += OPAL_ALIGN_PAD_AMOUNT (size , page_size );
633+ data_base += OPAL_ALIGN_PAD_AMOUNT (data_base , memory_alignment );
634+ size += OPAL_ALIGN_PAD_AMOUNT (size , memory_alignment );
621635 }
622636
623637 do {
@@ -645,6 +659,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
645659 my_base_offset = total_size ;
646660 }
647661 total_size += temp [i ].size ;
662+ total_size += OPAL_ALIGN_PAD_AMOUNT (total_size , memory_alignment );
648663 }
649664 }
650665
@@ -656,12 +671,12 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
656671 if (0 > ret ) {
657672 ret = OMPI_ERR_OUT_OF_RESOURCE ;
658673 } else {
659- /* allocate enough space for the state + data for all local ranks */
660- ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
661- free (data_file );
662- if (OPAL_SUCCESS != ret ) {
663- OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
664- }
674+ /* allocate enough space for the state + data for all local ranks */
675+ ret = opal_shmem_segment_create (& module -> seg_ds , data_file , total_size );
676+ free (data_file );
677+ if (OPAL_SUCCESS != ret ) {
678+ OSC_RDMA_VERBOSE (MCA_BASE_VERBOSE_ERROR , "failed to create shared memory segment" );
679+ }
665680 }
666681 }
667682
@@ -688,6 +703,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
688703 }
689704
690705 if (size && MPI_WIN_FLAVOR_ALLOCATE == module -> flavor ) {
706+ size_t page_size = opal_getpagesize ();
691707 char * baseptr = (char * )((intptr_t ) module -> segment_base + my_base_offset );
692708 * base = (void * )baseptr ;
693709 // touch each page to force allocation on local NUMA node
@@ -794,7 +810,7 @@ static int allocate_state_shared (ompi_osc_rdma_module_t *module, void **base, s
794810 }
795811
796812 if (my_rank == peer_rank ) {
797- module -> my_peer = peer ;
813+ module -> my_peer = peer ;
798814 }
799815
800816 if (MPI_WIN_FLAVOR_DYNAMIC != module -> flavor && MPI_WIN_FLAVOR_CREATE != module -> flavor &&
@@ -854,12 +870,12 @@ static int ompi_osc_rdma_query_mtls (void)
854870
855871 mtls_to_use = opal_argv_split (ompi_osc_rdma_mtl_names , ',' );
856872 if (mtls_to_use && ompi_mtl_base_selected_component ) {
857- for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
858- if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
859- opal_argv_free (mtls_to_use );
860- return OMPI_SUCCESS ;
861- }
862- }
873+ for (int i = 0 ; mtls_to_use [i ] ; ++ i ) {
874+ if (0 == strcmp (mtls_to_use [i ], ompi_mtl_base_selected_component -> mtl_version .mca_component_name )) {
875+ opal_argv_free (mtls_to_use );
876+ return OMPI_SUCCESS ;
877+ }
878+ }
863879 }
864880 opal_argv_free (mtls_to_use );
865881 return -1 ;
@@ -1304,6 +1320,8 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13041320 int world_size = ompi_comm_size (comm );
13051321 int init_limit = 256 ;
13061322 int ret ;
1323+ int flag ;
1324+ char infoval [32 ];
13071325 char * name ;
13081326
13091327 /* the osc/sm component is the exclusive provider for support for shared
@@ -1342,6 +1360,18 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
13421360 module -> win = win ;
13431361 module -> disp_unit = disp_unit ;
13441362 module -> size = size ;
1363+ module -> memory_alignment = mca_osc_rdma_component .memory_alignment ;
1364+ if (NULL != info ) {
1365+ opal_cstring_t * align_info_str ;
1366+ opal_info_get (info , "mpi_minimum_memory_alignment" , & align_info_str , & flag );
1367+ if (flag ) {
1368+ ssize_t tmp_align = atoll (align_info_str -> string );
1369+ OBJ_RELEASE (align_info_str );
1370+ if (OPAL_ALIGN_MIN < tmp_align ) {
1371+ module -> memory_alignment = tmp_align ;
1372+ }
1373+ }
1374+ }
13451375
13461376 /* set the module so we properly cleanup */
13471377 win -> w_osc_module = (ompi_osc_base_module_t * ) module ;
@@ -1414,15 +1444,16 @@ static int ompi_osc_rdma_component_select (struct ompi_win_t *win, void **base,
14141444 } else {
14151445 module -> state_size += mca_osc_rdma_component .max_attach * module -> region_size ;
14161446 }
1417- /*
1418- * These are the info's that this module is interested in
1419- */
1447+
1448+ /*
1449+ * These are the info keys that this module is interested in
1450+ */
14201451 opal_infosubscribe_subscribe (& win -> super , "no_locks" , "false" , ompi_osc_rdma_set_no_lock_info );
14211452
1422- /*
1423- * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1424- * to be used anywhere. If that changes, they should be subscribed
1425- */
1453+ /*
1454+ * TODO: same_size, same_disp_unit have w_flag entries, but do not appear
1455+ * to be used anywhere. If that changes, they should be subscribed
1456+ */
14261457
14271458 /* fill in the function pointer part */
14281459 memcpy (& module -> super , & ompi_osc_rdma_module_rdma_template , sizeof (module -> super ));
@@ -1540,8 +1571,8 @@ ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, cons
15401571 }
15411572 /* enforce collectiveness... */
15421573 module -> comm -> c_coll -> coll_barrier (module -> comm , module -> comm -> c_coll -> coll_barrier_module );
1543- /*
1544- * Accept any value
1545- */
1574+ /*
1575+ * Accept any value
1576+ */
15461577 return module -> no_locks ? "true" : "false" ;
15471578}
0 commit comments