22 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
33 * University Research and Technology
44 * Corporation. All rights reserved.
5- * Copyright (c) 2004-2014 The University of Tennessee and The University
5+ * Copyright (c) 2004-2020 The University of Tennessee and The University
66 * of Tennessee Research Foundation. All rights
77 * reserved.
88 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
4747/*
4848 * Local functions
4949 */
50- static void backend_fatal ( char * type , struct ompi_communicator_t * comm ,
50+ static void backend_abort ( int fatal , char * type , struct ompi_communicator_t * comm ,
5151 char * name , int * error_code , va_list arglist );
5252static void out (char * str , char * arg );
5353
@@ -68,7 +68,7 @@ void ompi_mpi_errors_are_fatal_comm_handler(struct ompi_communicator_t **comm,
6868 name = NULL ;
6969 abort_comm = NULL ;
7070 }
71- backend_fatal ( "communicator" , abort_comm , name , error_code , arglist );
71+ backend_abort (true, "communicator" , abort_comm , name , error_code , arglist );
7272 va_end (arglist );
7373}
7474
@@ -89,7 +89,7 @@ void ompi_mpi_errors_are_fatal_file_handler(struct ompi_file_t **file,
8989 name = NULL ;
9090 abort_comm = NULL ;
9191 }
92- backend_fatal ( "file" , abort_comm , name , error_code , arglist );
92+ backend_abort (true, "file" , abort_comm , name , error_code , arglist );
9393 va_end (arglist );
9494}
9595
@@ -108,7 +108,67 @@ void ompi_mpi_errors_are_fatal_win_handler(struct ompi_win_t **win,
108108 } else {
109109 name = NULL ;
110110 }
111- backend_fatal ("win" , abort_comm , name , error_code , arglist );
111+ backend_abort (true, "win" , abort_comm , name , error_code , arglist );
112+ va_end (arglist );
113+ }
114+
115+ void ompi_mpi_errors_abort_comm_handler (struct ompi_communicator_t * * comm ,
116+ int * error_code , ...)
117+ {
118+ char * name ;
119+ struct ompi_communicator_t * abort_comm ;
120+ va_list arglist ;
121+
122+ va_start (arglist , error_code );
123+
124+ if ( (NULL != comm ) && (NULL != * comm ) ) {
125+ name = (* comm )-> c_name ;
126+ abort_comm = * comm ;
127+ } else {
128+ name = NULL ;
129+ abort_comm = NULL ;
130+ }
131+ backend_abort (false, "communicator" , abort_comm , name , error_code , arglist );
132+ va_end (arglist );
133+ }
134+
135+
136+ void ompi_mpi_errors_abort_file_handler (struct ompi_file_t * * file ,
137+ int * error_code , ...)
138+ {
139+ char * name ;
140+ struct ompi_communicator_t * abort_comm ;
141+ va_list arglist ;
142+
143+ va_start (arglist , error_code );
144+
145+ if (NULL != file ) {
146+ name = (* file )-> f_filename ;
147+ abort_comm = (* file )-> f_comm ;
148+ } else {
149+ name = NULL ;
150+ abort_comm = NULL ;
151+ }
152+ backend_abort (false, "file" , abort_comm , name , error_code , arglist );
153+ va_end (arglist );
154+ }
155+
156+
157+ void ompi_mpi_errors_abort_win_handler (struct ompi_win_t * * win ,
158+ int * error_code , ...)
159+ {
160+ char * name ;
161+ struct ompi_communicator_t * abort_comm = NULL ;
162+ va_list arglist ;
163+
164+ va_start (arglist , error_code );
165+
166+ if (NULL != win ) {
167+ name = (* win )-> w_name ;
168+ } else {
169+ name = NULL ;
170+ }
171+ backend_abort (false, "win" , abort_comm , name , error_code , arglist );
112172 va_end (arglist );
113173}
114174
@@ -175,7 +235,7 @@ static void out(char *str, char *arg)
175235 * there's no need to handle the pre-MPI_INIT and post-MPI_FINALIZE
176236 * errors here.
177237 */
178- static void backend_fatal_aggregate ( char * type ,
238+ static void backend_abort_aggregate ( int fatal , char * type ,
179239 struct ompi_communicator_t * comm ,
180240 char * name , int * error_code ,
181241 va_list arglist )
@@ -199,7 +259,7 @@ static void backend_fatal_aggregate(char *type,
199259 ompi_process_info .nodename ,
200260 (int ) ompi_process_info .pid ) == -1 ) {
201261 prefix = NULL ;
202- // non-fatal , we could still go on to give useful information here...
262+ // non-fatal, we could still go on to give useful information here...
203263 opal_output (0 , "%s" , "Could not write node and PID to prefix" );
204264 opal_output (0 , "Node: %s" , ompi_process_info .nodename );
205265 opal_output (0 , "PID: %d" , (int ) ompi_process_info .pid );
@@ -224,7 +284,7 @@ static void backend_fatal_aggregate(char *type,
224284
225285 if (NULL != name ) {
226286 opal_show_help ("help-mpi-errors.txt" ,
227- "mpi_errors_are_fatal" ,
287+ fatal ? "mpi_errors_are_fatal" : "mpi_errors_abort " ,
228288 false,
229289 usable_prefix ,
230290 (NULL == arg ) ? "" : "in" ,
@@ -267,15 +327,15 @@ static void backend_fatal_aggregate(char *type,
267327
268328/*
269329 * Note that this function has to handle pre-MPI_INIT and
270- * post-MPI_FINALIZE errors, which backend_fatal_aggregate () does not
330+ * post-MPI_FINALIZE errors, which backend_abort_aggregate () does not
271331 * have to handle.
272332 *
273333 * This function also intentionally does not call malloc(), just in
274334 * case we're being called due to some kind of stack/memory error --
275335 * we *might* be able to get a message out if we're not further
276336 * corrupting the stack by calling malloc()...
277337 */
278- static void backend_fatal_no_aggregate ( char * type ,
338+ static void backend_abort_no_aggregate ( int fatal , char * type ,
279339 struct ompi_communicator_t * comm ,
280340 char * name , int * error_code ,
281341 va_list arglist )
@@ -303,7 +363,7 @@ static void backend_fatal_no_aggregate(char *type,
303363 "*** Unfortunately, no further information is available on *which* MPI\n"
304364 "*** function was invoked, sorry. :-(\n" , NULL );
305365 }
306- out ("*** Your MPI job will now abort.\n" , NULL );
366+ if ( fatal ) out ("*** Your MPI job will now abort.\n" , NULL );
307367 } else if (state >= OMPI_MPI_STATE_FINALIZE_PAST_COMM_SELF_DESTRUCT ) {
308368 if (NULL != arg ) {
309369 out ("*** The %s() function was called after MPI_FINALIZE was invoked.\n"
@@ -314,7 +374,7 @@ static void backend_fatal_no_aggregate(char *type,
314374 "*** Unfortunately, no further information is available on *which* MPI\n"
315375 "*** function was invoked, sorry. :-(\n" , NULL );
316376 }
317- out ("*** Your MPI job will now abort.\n" , NULL );
377+ if ( fatal ) out ("*** Your MPI job will now abort.\n" , NULL );
318378 }
319379
320380 else {
@@ -365,23 +425,30 @@ static void backend_fatal_no_aggregate(char *type,
365425 out ("*** Error code: %d (no associated error message)\n" , intbuf );
366426 }
367427 }
368- /* out("*** MPI_ERRORS_ARE_FATAL: your MPI job will now abort\n", NULL); */
369- out ("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n" , type );
370- out ("*** and potentially your MPI job)\n" , NULL );
371-
428+ /* out("*** MPI_ERRORS_ABORT: your MPI job will now abort\n", NULL); */
429+ if (fatal ) {
430+ out ("*** MPI_ERRORS_ARE_FATAL (processes in this %s will now abort,\n" , type );
431+ out ("*** and MPI will try to terminate your MPI job as well)\n" , NULL );
432+ }
433+ else {
434+ out ("*** MPI_ERRORS_ABORT (processes in this %s will now abort,\n" , type );
435+ out ("*** and potentially the rest of your MPI job)\n" , NULL );
436+ }
372437 }
373438 va_end (arglist );
374439}
375440
376- static void backend_fatal ( char * type , struct ompi_communicator_t * comm ,
441+ static void backend_abort ( int fatal , char * type , struct ompi_communicator_t * comm ,
377442 char * name , int * error_code ,
378443 va_list arglist )
379444{
445+ int err = MPI_ERR_UNKNOWN ;
446+
380447 /* We only want aggregation while the rte is initialized */
381448 if (ompi_rte_initialized ) {
382- backend_fatal_aggregate ( type , comm , name , error_code , arglist );
449+ backend_abort_aggregate ( fatal , type , comm , name , error_code , arglist );
383450 } else {
384- backend_fatal_no_aggregate ( type , comm , name , error_code , arglist );
451+ backend_abort_no_aggregate ( fatal , type , comm , name , error_code , arglist );
385452 }
386453
387454 /* In most instances the communicator will be valid. If not, we are either early in
@@ -392,9 +459,9 @@ static void backend_fatal(char *type, struct ompi_communicator_t *comm,
392459 comm = & ompi_mpi_comm_self .comm ;
393460 }
394461
395- if (NULL != error_code ) {
396- ompi_mpi_abort ( comm , * error_code ) ;
397- } else {
398- ompi_mpi_abort ( comm , 1 );
399- }
462+ if (NULL != error_code )
463+ err = * error_code ;
464+
465+ /* Call abort without a specified comm to force RTE Job termination */
466+ ompi_mpi_abort ( fatal ? NULL : comm , err );
400467}
0 commit comments