Skip to content

Commit ed9766d

Browse files
Expand Fenix config options
1 parent e5ed43e commit ed9766d

File tree

12 files changed

+152
-204
lines changed

12 files changed

+152
-204
lines changed

include/fenix.h

Lines changed: 32 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,30 @@ typedef enum {
146146
FENIX_ROLE_SURVIVOR_RANK = 2
147147
} Fenix_Rank_role;
148148

149+
/**
150+
* @brief Options for passing control back to application after recovery.
151+
*/
152+
typedef enum {
153+
//!Return to Fenix_Init via longjmp (default)
154+
JUMP,
155+
//!Return the error code inline
156+
RETURN,
157+
//!Throw a Fenix::CommException
158+
THROW
159+
} Fenix_Resume_mode;
160+
161+
/**
162+
* @brief Options for dealing with 'unhandled' errors, e.g. invalid rank IDs
163+
*/
164+
typedef enum {
165+
//!Ignore unhandled errors
166+
SILENT,
167+
//!Print error and continue without handling
168+
PRINT,
169+
//!Print error and abort Fenix's world (default)
170+
ABORT
171+
} Fenix_Unhandled_mode;
172+
149173
/**
150174
* @fn void Fenix_Init(int* role, MPI_Comm comm, MPI_Comm* newcomm, int** argc, char*** argv, int spare_ranks, int spawn, MPI_Info info, int* error);
151175
* @brief Build a resilient communicator and set the restart point.
@@ -197,14 +221,13 @@ typedef enum {
197221
* @param[in] spawn *Unimplemented*: Whether to enable spawning new ranks to replace
198222
* failed ranks when spares are unavailable.
199223
* @param[in] info Fenix recovery configuration parameters, may be MPI_INFO_NULL
200-
* Supports the "FENIX_RESUME_MODE" key, used to indicate where execution should resume upon
224+
* The "FENIX_RESUME_MODE" key is used to indicate where execution should resume upon
201225
* rank failure for all active (non-spare) ranks in any resilient communicators, not only for
202-
* those ranks in communicators that failed. The following values associated with the
203-
* "resume_mode" key are supported:
204-
* - "Fenix_init" (default): execution resumes at logical exit of Fenix_Init.
205-
* - "NO_JUMP": execution continues from the failing MPI call. Errors are otherwise handled
206-
* as normal, but return the error code as well. Applications should typically
207-
* either check for return codes or assign an error callback through Fenix.
226+
* those ranks in communicators that failed. The value should be a string with the name of a
227+
* Fenix_Resume_mode enum value.
228+
* The "FENIX_UNHANDLED_MODE" key is used to indicate how Fenix should handle error values
229+
* returned by MPI functions that are unrelated to failed processes. The value should be
230+
* a string with the name of a Fenix_Unhandled_mode enum value.
208231
* @param[out] error The return status of \c Fenix_Init<br>
209232
* Used to signal that a non-fatal error or special condition was encountered in the execution of
210233
* Fenix_Init, or FENIX_SUCCESS otherwise. It has the same value across all ranks released by
@@ -221,10 +244,8 @@ typedef enum {
221244
*(_role) = __fenix_preinit(_role, _comm, _newcomm, _argc, \
222245
_argv, _spare_ranks, _spawn, _info, \
223246
_error, &bufjmp); \
224-
if(setjmp(bufjmp)) { \
225-
*(_role) = FENIX_ROLE_SURVIVOR_RANK; \
226-
} \
227-
__fenix_postinit( _error ); \
247+
setjmp(bufjmp); \
248+
__fenix_postinit(); \
228249
}
229250

230251

include/fenix.hpp

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -72,14 +72,4 @@
7272
*/
7373
int Fenix_Callback_register(std::function<void(MPI_Comm, int)> callback);
7474

75-
/**
76-
* @brief Registers a callback that throws a CommException
77-
*
78-
* This means no longjmp will occur, and instead applications
79-
* will continue from their try-catch error handler.
80-
*
81-
* @returnstatus
82-
*/
83-
int register_exception_callback();
84-
8575
#endif

include/fenix_ext.hpp

Lines changed: 26 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -66,49 +66,44 @@
6666
typedef struct __fenix_data_recovery fenix_data_recovery_t;
6767

6868
typedef struct {
69-
int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init
70-
int num_survivor_ranks; // Keeps the global information on the number of survived MPI ranks after failure
71-
int num_recovered_ranks; // Keeps the number of spare ranks brought into MPI communicator recovery
72-
int resume_mode; // Defines how program resumes after process recovery
73-
int spawn_policy; // Indicate dynamic process spawning
74-
int spare_ranks; // Spare ranks entered by user to repair failed ranks
75-
int repair_result; // Internal global variable to store the result of MPI communicator repair
76-
int finalized;
69+
int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init
70+
int num_survivor_ranks = 0; // Keeps the global information on the number of survived MPI ranks after failure
71+
int num_recovered_ranks = 0; // Keeps the number of spare ranks brought into MPI communicator recovery
72+
int spare_ranks; // Spare ranks entered by user to repair failed ranks
73+
74+
int resume_mode = Fenix_Resume_mode::JUMP;
75+
int unhandled_mode = Fenix_Unhandled_mode::ABORT;
76+
int ignore_errs = false; // Temporarily ignore all errors & recovery
77+
int spawn_policy; // Indicate dynamic process spawning
7778
jmp_buf *recover_environment; // Calling environment to fill the jmp_buf structure
7879

80+
int repair_result = FENIX_SUCCESS; // Internal variable to store the result of MPI comm repair
81+
int role = FENIX_ROLE_INITIAL_RANK;
7982

80-
//enum FenixRankRole role; // Role of rank: initial, survivor or repair
81-
int role; // Role of rank: initial, survivor or repair
82-
int fenix_init_flag = 0;
83+
int fenix_init_flag = false;
84+
int finalized = false;
8385

84-
int fail_world_size;
85-
int* fail_world;
86+
int fail_world_size = 0;
87+
int* fail_world = nullptr;
8688

8789
//Save the pointer to role and error of Fenix_Init
88-
int *ret_role;
89-
int *ret_error;
90+
int *ret_role = nullptr;
91+
int *ret_error = nullptr;
9092

9193
std::vector<fenix_callback_func> callbacks;
92-
fenix_debug_opt_t options; // This is reserved to store the user options
94+
fenix_debug_opt_t options; // This is reserved to store the user options
9395

94-
MPI_Comm *world; // Duplicate of the MPI communicator provided by user
95-
MPI_Comm new_world; // Global MPI communicator identical to g_world but without spare ranks
96-
MPI_Comm *user_world; // MPI communicator with repaired ranks
97-
//Manage state of the comms. Necessary when failures happen rapidly, mussing up state
98-
int new_world_exists, user_world_exists;
99-
96+
MPI_Comm *world; // Duplicate of comm provided by user
97+
MPI_Comm *user_world; // User-facing comm with repaired ranks and no spares
98+
MPI_Comm new_world; // Internal duplicate of user_world
99+
int new_world_exists = false, user_world_exists = false;
100+
101+
//Values used for Fenix_Process_detect_failures
100102
int dummy_recv_buffer;
101103
MPI_Request check_failures_req;
102104

103-
104-
MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API
105-
106-
107-
MPI_Errhandler mpi_errhandler; // This stores callback info for our custom error handler
108-
int ignore_errs; // Set this to return errors instead of using the error handler normally. (Don't forget to unset!)
109-
int print_unhandled; // Set this to print the error string for MPI errors of an unhandled return type.
110-
111-
105+
MPI_Op agree_op; // Global agreement call for Fenix data recovery API
106+
MPI_Errhandler mpi_errhandler; // Our custom error handler
112107

113108
fenix_data_recovery_t *data_recovery; // Global pointer for Fenix Data Recovery Data Structure
114109
} fenix_t;

include/fenix_init.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ extern "C" {
6767
int __fenix_preinit(int *, MPI_Comm, MPI_Comm *, int *, char ***, int, int, MPI_Info, int *, jmp_buf *);
6868

6969

70-
void __fenix_postinit(int *);
70+
void __fenix_postinit();
7171

7272
#if defined(c_plusplus) || defined(__cplusplus)
7373
}

include/fenix_opt.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@
7878
do { printf("%s(): " fmt, __func__, __VA_ARGS__); } while (0)
7979

8080
typedef struct __fenix_debug_opt_t {
81-
int verbose;
81+
int verbose = -1;
8282
} fenix_debug_opt_t;
8383

8484

include/fenix_process_recovery.hpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,13 +65,11 @@
6565
#include <stdarg.h>
6666
#include <stdint.h>
6767
#include <signal.h>
68+
#include <string_view>
6869

6970
#include "fenix_init.h"
7071
#include <functional>
7172

72-
#define __FENIX_RESUME_AT_INIT 0
73-
#define __FENIX_RESUME_NO_JUMP 200
74-
7573
using fenix_callback_func = std::function<void(MPI_Comm, int)>;
7674

7775
typedef struct __fenix_comm_list_elm {
@@ -85,6 +83,10 @@ typedef struct {
8583
fenix_comm_list_elm_t *tail;
8684
} fenix_comm_list_t;
8785

86+
void __fenix_set_resume_mode(const std::string_view& name);
87+
88+
void __fenix_set_unhandled_mode(const std::string_view& name);
89+
8890
int __fenix_create_new_world();
8991

9092
int __fenix_repair_ranks();

src/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ FILE(GLOB Fenix_HEADERS ${CMAKE_SOURCE_DIR}/include/*.h*)
1616

1717
set (Fenix_SOURCES
1818
fenix.cpp
19-
fenix_exception.cpp
2019
fenix_opt.cpp
2120
fenix_process_recovery.cpp
2221
fenix_util.cpp

src/fenix_exception.cpp

Lines changed: 0 additions & 12 deletions
This file was deleted.

0 commit comments

Comments
 (0)