@@ -443,27 +443,32 @@ static void proc_errors(int fd, short args, void *cbdata)
443443 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc)));
444444 /* record the first one to fail */
445445 if (!ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_ABORTED)) {
446- /* output an error message so the user knows what happened */
447- orte_show_help("help-errmgr-base.txt", "node-died", true,
448- ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
449- orte_process_info.nodename,
450- ORTE_NAME_PRINT(proc),
451- pptr->node->name);
452446 /* mark the daemon job as failed */
453447 jdata->state = ORTE_JOB_STATE_COMM_FAILED;
454448 /* point to the lowest rank to cause the problem */
455449 orte_set_attribute(&jdata->attributes, ORTE_JOB_ABORTED_PROC, ORTE_ATTR_LOCAL, pptr, OPAL_PTR);
456450 /* retain the object so it doesn't get free'd */
457451 OBJ_RETAIN(pptr);
458452 ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_ABORTED);
459- /* update our exit code */
460- ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
461- /* just in case the exit code hadn't been set, do it here - this
462- * won't override any reported exit code */
463- ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
453+ if (!orte_enable_recovery) {
454+ /* output an error message so the user knows what happened */
455+ orte_show_help("help-errmgr-base.txt", "node-died", true,
456+ ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
457+ orte_process_info.nodename,
458+ ORTE_NAME_PRINT(proc),
459+ pptr->node->name);
460+ /* update our exit code */
461+ ORTE_UPDATE_EXIT_STATUS(pptr->exit_code);
462+ /* just in case the exit code hadn't been set, do it here - this
463+ * won't override any reported exit code */
464+ ORTE_UPDATE_EXIT_STATUS(ORTE_ERR_COMM_FAILURE);
465+ }
466+ }
467+ /* if recovery is enabled, then we are done - otherwise,
468+ * abort the system */
469+ if (!orte_enable_recovery) {
470+ default_hnp_abort(jdata);
464471 }
465- /* abort the system */
466- default_hnp_abort(jdata);
467472 goto cleanup;
468473 }
469474
@@ -498,7 +503,8 @@ static void proc_errors(int fd, short args, void *cbdata)
498503 keep_going:
499504 /* if this is a continuously operating job, then there is nothing more
500505 * to do - we let the job continue to run */
501- if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL)) {
506+ if (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
507+ ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE)) {
502508 /* always mark the waitpid as having fired */
503509 ORTE_ACTIVATE_PROC_STATE(&pptr->name, ORTE_PROC_STATE_WAITPID_FIRED);
504510 /* if this is a remote proc, we won't hear anything more about it
0 commit comments