|
66 | 66 | typedef struct __fenix_data_recovery fenix_data_recovery_t; |
67 | 67 |
|
68 | 68 | typedef struct { |
69 | | - int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_init |
70 | | - int num_survivor_ranks; // Keeps the global information on the number of survived MPI ranks after failure |
71 | | - int num_recovered_ranks; // Keeps the number of spare ranks brought into MPI communicator recovery |
72 | | - int resume_mode; // Defines how program resumes after process recovery |
73 | | - int spawn_policy; // Indicate dynamic process spawning |
74 | | - int spare_ranks; // Spare ranks entered by user to repair failed ranks |
75 | | - int repair_result; // Internal global variable to store the result of MPI communicator repair |
76 | | - int finalized; |
| 69 | + int num_inital_ranks; // Keeps the global MPI rank ID at Fenix_Init. NOTE(review): the field name reads as a rank count, not an ID - confirm; no default initializer |
| 70 | + int num_survivor_ranks = 0; // Global count of MPI ranks that survived after a failure; starts at 0 |
| 71 | + int num_recovered_ranks = 0; // Number of spare ranks brought into MPI communicator recovery; starts at 0 |
| 72 | + int spare_ranks; // Spare ranks entered by user to repair failed ranks (no default initializer) |
| 73 | + |
| 74 | + int resume_mode = Fenix_Resume_mode::JUMP; |
| 75 | + int unhandled_mode = Fenix_Unhandled_mode::ABORT; |
| 76 | + int ignore_errs = false; // Set to temporarily ignore all errors & recovery; false by default |
| 77 | + int spawn_policy; // Indicates dynamic process spawning (no default initializer) |
77 | 78 | jmp_buf *recover_environment; // Pointer to the jmp_buf that is filled with the calling environment |
78 | 79 |
|
| 80 | + int repair_result = FENIX_SUCCESS; // Internal variable storing the result of MPI comm repair; defaults to FENIX_SUCCESS |
| 81 | + int role = FENIX_ROLE_INITIAL_RANK; |
79 | 82 |
|
80 | | - //enum FenixRankRole role; // Role of rank: initial, survivor or repair |
81 | | - int role; // Role of rank: initial, survivor or repair |
82 | | - int fenix_init_flag = 0; |
| 83 | + int fenix_init_flag = false; |
| 84 | + int finalized = false; |
83 | 85 |
|
84 | | - int fail_world_size; |
85 | | - int* fail_world; |
| 86 | + int fail_world_size = 0; |
| 87 | + int* fail_world = nullptr; |
86 | 88 |
|
87 | 89 | // Saved pointers to the role and error of Fenix_Init (both start as nullptr) |
88 | | - int *ret_role; |
89 | | - int *ret_error; |
| 90 | + int *ret_role = nullptr; |
| 91 | + int *ret_error = nullptr; |
90 | 92 |
|
91 | 93 | std::vector<fenix_callback_func> callbacks; |
92 | | - fenix_debug_opt_t options; // This is reserved to store the user options |
| 94 | + fenix_debug_opt_t options; // Reserved for storing the user options |
93 | 95 |
|
94 | | - MPI_Comm *world; // Duplicate of the MPI communicator provided by user |
95 | | - MPI_Comm new_world; // Global MPI communicator identical to g_world but without spare ranks |
96 | | - MPI_Comm *user_world; // MPI communicator with repaired ranks |
97 | | - //Manage state of the comms. Necessary when failures happen rapidly, mussing up state |
98 | | - int new_world_exists, user_world_exists; |
99 | | - |
| 96 | + MPI_Comm *world; // Duplicate of the comm provided by the user (pointer has no default initializer) |
| 97 | + MPI_Comm *user_world; // User-facing comm with repaired ranks and no spares (pointer has no default initializer) |
| 98 | + MPI_Comm new_world; // Internal duplicate of user_world |
| 99 | + int new_world_exists = false, user_world_exists = false; |
| 100 | + |
| 101 | + // Values used for Fenix_Process_detect_failures |
100 | 102 | int dummy_recv_buffer; |
101 | 103 | MPI_Request check_failures_req; |
102 | 104 |
|
103 | | - |
104 | | - MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API |
105 | | - |
106 | | - |
107 | | - MPI_Errhandler mpi_errhandler; // This stores callback info for our custom error handler |
108 | | - int ignore_errs; // Set this to return errors instead of using the error handler normally. (Don't forget to unset!) |
109 | | - int print_unhandled; // Set this to print the error string for MPI errors of an unhandled return type. |
110 | | - |
111 | | - |
| 105 | + MPI_Op agree_op; // Global agreement call for Fenix data recovery API |
| 106 | + MPI_Errhandler mpi_errhandler; // Our custom error handler |
112 | 107 |
|
113 | 108 | fenix_data_recovery_t *data_recovery; // Global pointer to the Fenix data recovery data structure (no default initializer) |
114 | 109 | } fenix_t; |
|
0 commit comments