@@ -1907,22 +1907,22 @@ backend_nomad() {
19071907 while \
19081908 ! test -f " ${dir} " /flag/cluster-stopping \
19091909 && \
1910- backend_nomad is-task-program-running " ${dir} " " node-${pool_ix} " " node-${pool_ix} " > /dev/null
1910+ backend_nomad is-task-program-running " ${dir} " " node-${pool_ix} " " node-${pool_ix} " 5 > /dev/null
19111911 do
19121912 # Always check that a started generator has not FAILED!
19131913 if \
19141914 test -f " ${dir} " /generator/started \
19151915 && \
19161916 ! test -f " ${dir} " /generator/quit \
19171917 && \
1918- ! backend_nomad is-task-program-running " ${dir} " " ${generator_task} " generator
1918+ ! backend_nomad is-task-program-running " ${dir} " " ${generator_task} " generator 5
19191919 then
1920- if backend_nomad is-task-program-failed " ${dir} " " ${generator_task} " generator
1920+ if backend_nomad is-task-program-failed " ${dir} " " ${generator_task} " generator 5
19211921 then
19221922 # If the node in "${generator_task}" quits, the generator fails with:
19231923 # tx-generator: MuxError MuxBearerClosed "<socket: 12> closed when reading data, waiting on next header True"
19241924 # Service binary 'tx-generator' returned status: 1
1925- if backend_nomad is-task-program-running " ${dir} " " ${generator_task} " " ${generator_task} "
1925+ if backend_nomad is-task-program-running " ${dir} " " ${generator_task} " " ${generator_task} " 5
19261926 then
19271927 # This was not expected!
19281928 # But check it wasn't a race condition of a stopping cluster!
@@ -2457,11 +2457,13 @@ backend_nomad() {
24572457 backend_nomad task-supervisorctl " ${dir} " " ${task} " stop " ${program} " > /dev/null
24582458 ;;
24592459
2460+ # Don't use fatal with no strikes, the exit trap uses it to stop everything!
24602461 is-task-program-running )
24612462 local usage=" USAGE: wb backend pass $op RUN-DIR TASK-NAME SUPERVISOR-PROGRAM"
24622463 local dir=${1:? $usage } ; shift
24632464 local task=${1:? $usage } ; shift
24642465 local program=${1:? $usage } ; shift
2466+ local strikes=${1:- " " }
24652467 # NOTICE: Only returns zero when RUNNING!
24662468 # > supervisorctl status
24672469 # generator RUNNING pid 83, uptime 0:00:23
@@ -2483,24 +2485,108 @@ backend_nomad() {
24832485 # 3
24842486 # > supervisorctl status node-0 >/dev/null; echo $?
24852487 # 3
2486- backend_nomad task-supervisorctl " $dir " " $task " status " $program " > /dev/null
2488+ local stderr_file=" ${dir} " /flag/is-task-program-running-" ${task} " -" ${program} "
2489+ :> " ${stderr_file} "
2490+ if ! backend_nomad task-supervisorctl " ${dir} " " ${task} " status " ${program} " > /dev/null 2> " ${stderr_file} "
2491+ then
2492+ # Command returned "false"
2493+ if test -s " ${stderr_file} "
2494+ then
2495+ # Command returned "false" with a non-empty stderr output
2496+ if test -n " ${strikes} "
2497+ then
2498+ # A strike parameter was given
2499+ msg " $( yellow " Function \" is-task-program-running\" failed: $( cat ${stderr_file} ) " ) "
2500+ strikes=$(( strikes - 1 ))
2501+ msg " $( yellow " Strikes for \" is-task-program-running\" left: ${strikes} " ) "
2502+ if test " ${strikes} " -gt 0
2503+ then
2504+ # Strikes still available, sleep/retry!
2505+ if test " ${strikes} " = 1
2506+ then
2507+ # Before the last retry, wait five minutes!
2508+ sleep 300 # 5 minutes!
2509+ else
2510+ sleep 60 # 1 minute!
2511+ fi
2512+ # Retry with one less strike available
2513+ backend_nomad is-task-program-running " ${dir} " " ${task} " " ${program} " " ${strikes} "
2514+ else
2515+ # Fails everything only if using strikes!
2516+ fatal " Function \" is-task-program-running\" failed: $( cat ${stderr_file} ) "
2517+ fi
2518+ else
2519+ # No strike parameter was given, don't use "fatal"!
2520+ msg " $( red " Function \" is-task-program-running\" failed: $( cat ${stderr_file} ) " ) "
2521+ false
2522+ fi
2523+ else
2524+ # Command returned "false" with an empty stderr output
2525+ false # Program is not running!
2526+ fi
2527+ else
2528+ # Command returned "true"
2529+ if test -s " ${stderr_file} "
2530+ then
2531+ # Don't suppress possible error messages!
2532+ msg " $( yellow " WARNING: \" is-task-program-running\" is returning a non-empty stderr: $( cat " ${stderr_file} " ) " ) "
2533+ fi
2534+ true # Program is running!
2535+ fi
24872536 ;;
24882537
# Report whether a supervisord program running inside a Nomad task has failed.
#
# Arguments: RUN-DIR TASK-NAME SUPERVISOR-PROGRAM [STRIKES]
#   STRIKES (optional): number of attempts allowed when the program's
#   "exit_code" file cannot be fetched; when omitted no retry is made.
# Returns: zero when the fetched exit code is not "0", or when the fetch
#   failed and no STRIKES were given (the program is then assumed failed).
#   With STRIKES, "fatal" is called once no strikes are left.
#
# Don't use fatal with no strikes, the exit trap uses it to stop everything!
is-task-program-failed )
  local usage="USAGE: wb backend pass $op RUN-DIR TASK-NAME SUPERVISOR-PROGRAM"
  local dir=${1:?$usage}; shift
  local task=${1:?$usage}; shift
  local program=${1:?$usage}; shift
  local strikes=${1:-""}
  # As we are not using any "autorestart" supervisord programs are run as:
  # command=sh -c "./start.sh; echo "$?" > ./exit_code"
  # because we can't obtain the exit codes using `supervisorctl`
  local stderr_file="${dir}"/flag/is-task-program-failed-"${task}"-"${program}"
  # Start every attempt with an empty stderr capture file.
  :> "${stderr_file}"
  # Declared separately so `local` does not mask the command's exit status.
  local exit_code
  if ! exit_code=$(backend_nomad task-file-contents "${dir}" "${task}" \
        /local/run/current/"${program}"/exit_code 2> "${stderr_file}")
  then
    # Command returned "false"
    if test -n "${strikes}"
    then
      # A strike parameter was given
      msg "$(yellow "Function \"is-task-program-failed\" failed: $(cat "${stderr_file}")")"
      strikes=$(( strikes - 1 ))
      msg "$(yellow "Strikes for \"is-task-program-failed\" left: ${strikes}")"
      if test "${strikes}" -gt 0
      then
        # Strikes still available, sleep/retry!
        if test "${strikes}" = 1
        then
          # Before the last retry, wait five minutes!
          sleep 300 # 5 minutes!
        else
          sleep 60  # 1 minute!
        fi
        # Retry with one less strike available. The counter was already
        # decremented above: decrementing it again here would burn two
        # strikes per failure and could skip the five minutes wait.
        backend_nomad is-task-program-failed "${dir}" "${task}" "${program}" "${strikes}"
      else
        # Fails everything only if using strikes!
        fatal "Function \"is-task-program-failed\" failed"
      fi
    else
      # No strike parameter was given, don't use "fatal"!
      msg "$(red "Function \"is-task-program-failed\" failed: $(cat "${stderr_file}")")"
      true # Assuming program failed due to Nomad command error!
    fi
  else
    # Command returned "true"
    if test -s "${stderr_file}"
    then
      # Don't suppress possible error messages!
      msg "$(yellow "WARNING: \"is-task-program-failed\" is returning a non-empty stderr: $(cat "${stderr_file}")")"
    fi
    # Failed only when the recorded exit code is something other than "0".
    test "${exit_code}" != "0"
  fi
  ;;
25062592
0 commit comments